diff --git a/.github/ISSUE_TEMPLATE/40-tflite-op-request.md b/.github/ISSUE_TEMPLATE/40-tflite-op-request.md new file mode 100644 index 00000000000..7b391279e47 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/40-tflite-op-request.md @@ -0,0 +1,24 @@ +--- +name: TensorFlow Lite Op Request +about: Use this template for reporting ops you are using or missing. + +--- + + +**System information** +- OS Platform and Distribution (e.g., Linux Ubuntu 16.04): +- TensorFlow installed from (source or binary): +- TensorFlow version (or github SHA if from source): + + +**Provide the text output from tflite_convert** + +``` +# Copy and paste here +``` + +Also, please include a link to a GraphDef or the model if possible. + +**Any other info / logs** + +Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached. diff --git a/README.md b/README.md index 8af5370befb..6fefdd32244 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ data flow graphs. The graph nodes represent mathematical operations, while the graph edges represent the multidimensional data arrays (tensors) that flow between them. This flexible architecture enables you to deploy computation to one or more CPUs or GPUs in a desktop, server, or mobile device without rewriting -code. TensorFlow also includes [TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard), a data visualization toolkit. +code. TensorFlow also includes [TensorBoard](https://github.com/tensorflow/tensorboard), +a data visualization toolkit. TensorFlow was originally developed by researchers and engineers working on the Google Brain team within Google's Machine Intelligence Research @@ -111,7 +112,7 @@ The TensorFlow project strives to abide by generally accepted best practices in Build Type | Status | Artifacts ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- **IBM s390x** | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | TBA -**IBM ppc64le CPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/) | TBA +**IBM ppc64le CPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/) | TBA **IBM ppc64le GPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/) **IBM ppc64le GPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) **Linux CPU with IntelĀ® MKL-DNN** Nightly | [![Build 
Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) @@ -127,6 +128,7 @@ Build Type * [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap) * [TensorFlow White Papers](https://www.tensorflow.org/about/bib) * [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ) +* [TensorFlow Visualization Toolkit](https://github.com/tensorflow/tensorboard) Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate. diff --git a/WORKSPACE b/WORKSPACE index 0c7bc085b51..7cc08e0164a 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,5 +1,7 @@ workspace(name = "org_tensorflow") +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + http_archive( name = "io_bazel_rules_closure", sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae", @@ -57,9 +59,9 @@ android_workspace() # Please add all new TensorFlow dependencies in workspace.bzl. tf_workspace() -new_http_archive( +http_archive( name = "inception_v1", - build_file = "models.BUILD", + build_file = "//:models.BUILD", sha256 = "7efe12a8363f09bc24d7b7a450304a15655a57a7751929b2c1593a71183bb105", urls = [ "http://storage.googleapis.com/download.tensorflow.org/models/inception_v1.zip", @@ -67,9 +69,9 @@ new_http_archive( ], ) -new_http_archive( +http_archive( name = "mobile_ssd", - build_file = "models.BUILD", + build_file = "//:models.BUILD", sha256 = "bddd81ea5c80a97adfac1c9f770e6f55cbafd7cce4d3bbe15fbeb041e6b8f3e8", urls = [ "http://storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_android_export.zip", @@ -77,9 +79,9 @@ new_http_archive( ], ) -new_http_archive( +http_archive( name = "mobile_multibox", - build_file = "models.BUILD", + build_file = "//:models.BUILD", sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96", urls = [ "http://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip", @@ -87,9 +89,9 @@ new_http_archive( ], ) -new_http_archive( +http_archive( name = "stylize", - build_file = "models.BUILD", + build_file = "//:models.BUILD", sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa", urls = [ "http://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip", @@ -97,9 +99,9 @@ new_http_archive( ], ) -new_http_archive( +http_archive( name = "speech_commands", - build_file = "models.BUILD", + build_file = "//:models.BUILD", sha256 = "c3ec4fea3158eb111f1d932336351edfe8bd515bb6e87aad4f25dbad0a600d0c", urls = [ "http://storage.googleapis.com/download.tensorflow.org/models/speech_commands_v0.01.zip", diff --git a/configure.py b/configure.py index 234561d94a4..5f429c3de89 100644 --- a/configure.py +++ b/configure.py @@ -238,6 +238,13 @@ def setup_python(environ_cp): write_to_bazelrc('build --python_path=\"%s"' % python_bin_path) environ_cp['PYTHON_BIN_PATH'] = python_bin_path + # If choosen python_lib_path is from a path specified in the PYTHONPATH + # variable, need to tell bazel to include PYTHONPATH + if environ_cp.get('PYTHONPATH'): + python_paths = environ_cp.get('PYTHONPATH').split(':') + if python_lib_path in python_paths: + write_action_env_to_bazelrc('PYTHONPATH', environ_cp.get('PYTHONPATH')) + # Write tools/python_bin_path.sh with open( os.path.join(_TF_WORKSPACE_ROOT, 'tools', 
'python_bin_path.sh'), @@ -445,11 +452,12 @@ def convert_version_to_int(version): return int(version_str) -def check_bazel_version(min_version): - """Check installed bazel version is at least min_version. +def check_bazel_version(min_version, max_version): + """Check installed bazel version is between min_version and max_version. Args: min_version: string for minimum bazel version. + max_version: string for maximum bazel version. Returns: The bazel version detected. @@ -467,6 +475,7 @@ def check_bazel_version(min_version): min_version_int = convert_version_to_int(min_version) curr_version_int = convert_version_to_int(curr_version) + max_version_int = convert_version_to_int(max_version) # Check if current bazel version can be detected properly. if not curr_version_int: @@ -480,6 +489,10 @@ def check_bazel_version(min_version): print('Please upgrade your bazel installation to version %s or higher to ' 'build TensorFlow!' % min_version) sys.exit(0) + if curr_version_int > max_version_int: + print('Please downgrade your bazel installation to version %s or lower to ' + 'build TensorFlow!' % max_version) + sys.exit(0) return curr_version @@ -859,7 +872,7 @@ def set_tf_cuda_version(environ_cp): cuda_toolkit_paths_full = [ os.path.join(cuda_toolkit_path, x) for x in cuda_rt_lib_paths ] - if any([os.path.exists(x) for x in cuda_toolkit_paths_full]): + if any(os.path.exists(x) for x in cuda_toolkit_paths_full): break # Reset and retry @@ -1552,7 +1565,7 @@ def main(): # environment variables. environ_cp = dict(os.environ) - check_bazel_version('0.15.0') + check_bazel_version('0.15.0', '0.20.0') reset_tf_configure_bazelrc() # Explicitly import tools/bazel.rc, this is needed for Bazel 0.19.0 or later @@ -1694,6 +1707,7 @@ def main(): config_info_line('nohdfs', 'Disable HDFS support.') config_info_line('noignite', 'Disable Apacha Ignite support.') config_info_line('nokafka', 'Disable Apache Kafka support.') + config_info_line('nonccl', 'Disable NVIDIA NCCL support.') if __name__ == '__main__': diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 859dc3b8d77..fd4b94202aa 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -43,6 +43,11 @@ TENSORFLOW_API_INIT_FILES_V2 = ( TENSORFLOW_API_INIT_FILES + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1) ) +# @unused +TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT = ( + TENSORFLOW_API_INIT_FILES_V1 + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1) +) + # Config setting used when building for products # which requires restricted licenses to be avoided. 
config_setting( @@ -213,31 +218,37 @@ config_setting( # config_setting( name = "no_aws_support", - define_values = {"no_aws_support": "false"}, + define_values = {"no_aws_support": "true"}, visibility = ["//visibility:public"], ) config_setting( name = "no_gcp_support", - define_values = {"no_gcp_support": "false"}, + define_values = {"no_gcp_support": "true"}, visibility = ["//visibility:public"], ) config_setting( name = "no_hdfs_support", - define_values = {"no_hdfs_support": "false"}, + define_values = {"no_hdfs_support": "true"}, visibility = ["//visibility:public"], ) config_setting( name = "no_ignite_support", - define_values = {"no_ignite_support": "false"}, + define_values = {"no_ignite_support": "true"}, visibility = ["//visibility:public"], ) config_setting( name = "no_kafka_support", - define_values = {"no_kafka_support": "false"}, + define_values = {"no_kafka_support": "true"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "no_nccl_support", + define_values = {"no_nccl_support": "true"}, visibility = ["//visibility:public"], ) @@ -350,7 +361,7 @@ package_group( "-//third_party/tensorflow/python/estimator", "//learning/meta_rank/...", "//tensorflow/...", - "//tensorflow_estimator/...", + "//tensorflow_estimator/contrib/...", "//tensorflow_fold/llgtm/...", "//tensorflow_text/...", "//third_party/py/tensor2tensor/...", @@ -554,18 +565,24 @@ genrule( }), outs = ["__init__.py"], cmd = select({ - "api_version_2": "cp $(@D)/_api/v2/__init__.py $(OUTS)", - "//conditions:default": "cp $(@D)/_api/v1/__init__.py $(OUTS)", + "api_version_2": "cp $(@D)/_api/v2/v2.py $(OUTS)", + "//conditions:default": "cp $(@D)/_api/v1/v1.py $(OUTS)", }), ) gen_api_init_files( name = "tf_python_api_gen_v1", - srcs = ["api_template_v1.__init__.py"], + srcs = [ + "api_template_v1.__init__.py", + "compat_template_v1.__init__.py", + ], api_version = 1, + compat_api_versions = [1], + compat_init_templates = ["compat_template_v1.__init__.py"], output_dir = "_api/v1/", - output_files = TENSORFLOW_API_INIT_FILES_V1, + output_files = TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT, output_package = "tensorflow._api.v1", + root_file_name = "v1.py", root_init_template = "api_template_v1.__init__.py", ) @@ -581,6 +598,7 @@ gen_api_init_files( output_dir = "_api/v2/", output_files = TENSORFLOW_API_INIT_FILES_V2, output_package = "tensorflow._api.v2", + root_file_name = "v2.py", root_init_template = "api_template.__init__.py", ) diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py index 0d497568385..d81cf067eb0 100644 --- a/tensorflow/api_template.__init__.py +++ b/tensorflow/api_template.__init__.py @@ -21,8 +21,6 @@ from __future__ import print_function as _print_function import os as _os # pylint: disable=g-bad-import-order -from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import - from tensorflow.python.tools import component_api_helper as _component_api_helper _component_api_helper.package_hook( parent_package_str=__name__, @@ -30,16 +28,16 @@ _component_api_helper.package_hook( # API IMPORTS PLACEHOLDER -from tensorflow.python.platform import flags # pylint: disable=g-import-not-at-top - # Make sure directory containing top level submodules is in # the __path__ so that "from tensorflow.foo import bar" works. -_tf_api_dir = _os.path.dirname(_os.path.dirname(app.__file__)) # pylint: disable=undefined-variable +# We're using bitwise, but there's nothing special about that. 
+_tf_api_dir = _os.path.dirname(_os.path.dirname(bitwise.__file__)) # pylint: disable=undefined-variable if _tf_api_dir not in __path__: __path__.append(_tf_api_dir) -# Calls to enable and disable features. -enable_eager_execution() # pylint: disable=undefined-variable +# Enable TF2 behaviors +from tensorflow.python.compat import compat as _compat # pylint: disable=g-import-not-at-top +_compat.enable_v2_behavior() # These symbols appear because we import the python package which # in turn imports from tensorflow.core and tensorflow.python. They diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index b8db1b21449..59c23e7c184 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -60,6 +60,7 @@ tf_cuda_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:op_gen_lib", + "//tensorflow/core/distributed_runtime:server_lib", ], }), ) @@ -120,7 +121,8 @@ tf_cuda_library( ":c_api", ":c_api_internal", "//tensorflow/c/eager:c_api", - "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags", + "//tensorflow/c/eager:c_api_internal", + "//tensorflow/compiler/jit:flags", "//tensorflow/contrib/tpu:all_ops", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -173,6 +175,30 @@ tf_cuda_library( ], ) +tf_cuda_library( + name = "kernels", + srcs = [ + "kernels.cc", + ], + hdrs = [ + "kernels.h", + ], + copts = tf_copts(), + visibility = ["//visibility:public"], + deps = select({ + "//tensorflow:android": [ + ":c_api", + ":c_api_internal", + "//tensorflow/core:android_tensorflow_lib_lite", + ], + "//conditions:default": [ + ":c_api", + ":c_api_internal", + "//tensorflow/core:framework", + ], + }), +) + # ----------------------------------------------------------------------------- # Tests @@ -208,7 +234,10 @@ tf_cuda_cc_test( "//tensorflow:darwin": ["-headerpad_max_install_names"], "//conditions:default": [], }), - tags = ["noasan"], + tags = [ + "no_oss", # http://b/119522529 + "noasan", + ], # We must ensure that the dependencies can be dynamically linked since # the shared library must be able to use core:framework. # linkstatic = tf_kernel_tests_linkstatic(), @@ -237,7 +266,7 @@ tf_cuda_cc_test( tf_cc_test( name = "c_api_experimental_test", - size = "small", + size = "medium", srcs = ["c_api_experimental_test.cc"], data = ["testdata/tf_record"], linkopts = select({ @@ -248,8 +277,11 @@ tf_cc_test( # the shared library must be able to use core:framework. # linkstatic = tf_kernel_tests_linkstatic(), deps = [ + ":c_api", ":c_api_experimental", ":c_test_util", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_test_util", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", @@ -300,6 +332,30 @@ tf_kernel_library( alwayslink = 1, ) +tf_cuda_cc_test( + name = "kernels_test", + size = "small", + srcs = ["kernels_test.cc"], + linkopts = select({ + "//tensorflow:darwin": ["-headerpad_max_install_names"], + "//conditions:default": [], + }), + tags = ["noasan"], + # We must ensure that the dependencies can be dynamically linked since + # the shared library must be able to use core:framework. 
+ # linkstatic = tf_kernel_tests_linkstatic(), + deps = [ + ":c_api", + ":kernels", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:proto_text", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + # ----------------------------------------------------------------------------- # Python API target diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index fabe2fa0f60..38e29aa74a9 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -15,13 +15,18 @@ limitations under the License. #include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_internal.h" -#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/net.h" #include "tensorflow/core/platform/platform.h" #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" @@ -51,8 +56,8 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) { // These XLA flags are needed to trigger XLA properly from C (more generally // non-Python) clients. If this API is called again with `enable` set to // false, it is safe to keep these flag values as is. - tensorflow::legacy_flags::MarkForCompilationPassFlags* flags = - tensorflow::legacy_flags::GetMarkForCompilationPassFlags(); + tensorflow::MarkForCompilationPassFlags* flags = + tensorflow::GetMarkForCompilationPassFlags(); flags->tf_xla_cpu_global_jit = true; flags->tf_xla_min_cluster_size = 1; } else { @@ -71,8 +76,8 @@ TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation, // These XLA flags are needed to trigger XLA properly from C (more generally // non-Python) clients. If this API is called again with `enable` set to // false, it is safe to keep these flag values as is. 
- tensorflow::legacy_flags::MarkForCompilationPassFlags* flags = - tensorflow::legacy_flags::GetMarkForCompilationPassFlags(); + tensorflow::MarkForCompilationPassFlags* flags = + tensorflow::GetMarkForCompilationPassFlags(); flags->tf_xla_cpu_global_jit = true; flags->tf_xla_min_cluster_size = 1; } else { @@ -6525,7 +6530,7 @@ library { } } node_def { - name: "ParallelInterleaveDataset/cycle_length" + name: "ExperimentalParallelInterleaveDataset/cycle_length" op: "Const" attr { key: "dtype" @@ -6546,7 +6551,7 @@ library { } } node_def { - name: "ParallelInterleaveDataset/block_length" + name: "ExperimentalParallelInterleaveDataset/block_length" op: "Const" attr { key: "dtype" @@ -6567,7 +6572,7 @@ library { } } node_def { - name: "ParallelInterleaveDataset/sloppy" + name: "ExperimentalParallelInterleaveDataset/sloppy" op: "Const" attr { key: "dtype" @@ -6588,7 +6593,7 @@ library { } } node_def { - name: "ParallelInterleaveDataset/buffer_output_elements" + name: "ExperimentalParallelInterleaveDataset/buffer_output_elements" op: "Const" attr { key: "dtype" @@ -6609,7 +6614,7 @@ library { } } node_def { - name: "ParallelInterleaveDataset/prefetch_input_elements" + name: "ExperimentalParallelInterleaveDataset/prefetch_input_elements" op: "Const" attr { key: "dtype" @@ -6630,14 +6635,14 @@ library { } } node_def { - name: "ParallelInterleaveDataset" - op: "ParallelInterleaveDataset" + name: "ExperimentalParallelInterleaveDataset" + op: "ExperimentalParallelInterleaveDataset" input: "RepeatDataset:handle:0" - input: "ParallelInterleaveDataset/cycle_length:output:0" - input: "ParallelInterleaveDataset/block_length:output:0" - input: "ParallelInterleaveDataset/sloppy:output:0" - input: "ParallelInterleaveDataset/buffer_output_elements:output:0" - input: "ParallelInterleaveDataset/prefetch_input_elements:output:0" + input: "ExperimentalParallelInterleaveDataset/cycle_length:output:0" + input: "ExperimentalParallelInterleaveDataset/block_length:output:0" + input: "ExperimentalParallelInterleaveDataset/sloppy:output:0" + input: "ExperimentalParallelInterleaveDataset/buffer_output_elements:output:0" + input: "ExperimentalParallelInterleaveDataset/prefetch_input_elements:output:0" attr { key: "Targuments" value { @@ -6737,7 +6742,7 @@ library { node_def { name: "ShuffleDataset_2" op: "ShuffleDataset" - input: "ParallelInterleaveDataset:handle:0" + input: "ExperimentalParallelInterleaveDataset:handle:0" input: "ShuffleDataset_2/buffer_size_1:output:0" input: "ShuffleDataset_2/seed_2:output:0" input: "ShuffleDataset_2/seed2_2:output:0" @@ -8739,14 +8744,65 @@ void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) { TF_DeleteStatus(status); } -TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status, - const char* errMsg) { +struct TFE_ExecuteOpNotification { + TFE_ExecuteOpNotification() : status(TF_NewStatus(), TF_DeleteStatus) {} + tensorflow::Notification n; + std::unique_ptr thread; + std::unique_ptr status; +}; + +TFE_ExecuteOpNotification* TFE_ExecuteOpInNewThread(TFE_Op* op, + TFE_TensorHandle** retvals, + int* num_retvals, + TF_Status* status) { + TFE_ExecuteOpNotification* n = new TFE_ExecuteOpNotification; + + n->thread.reset(op->operation.EagerContext()->TFEnv()->StartThread( + tensorflow::ThreadOptions(), "ExecuteOpThread", + [op, retvals, num_retvals, n]() { + TFE_Execute(op, retvals, num_retvals, n->status.get()); + n->n.Notify(); + })); + + return n; +} + +void TFE_ExecuteOpNotificationWaitAndDelete( + TFE_ExecuteOpNotification* notification, TF_Status* status) { + 
if (notification == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "Passed in notification is a nullptr."); + + return; + } + if (notification->thread == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "Passed in notification didn't start a thread correctly. Cleaning up " + "this notification. Please re-execute the operation to get a new " + "notification."); + + delete notification; + return; + } + + notification->n.WaitForNotification(); + + status->status = notification->status->status; + + delete notification; +} + +void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg) { status->status = tensorflow::errors::Internal(errMsg); } // This builder is used in the eager API to build a NodeDef. struct TF_AttrBuilder : public tensorflow::AttrBuilder { using tensorflow::AttrBuilder::AttrBuilder; + // The string buffers to make sure that any `attr_name` we pass into + // `builder->Set()` will outlive the subsequent + // `TF_AttrBuilderCheckCanRunOnDevice()` call(s) on the same `builder`. + std::set attr_names; }; TF_AttrBuilder* TF_NewAttrBuilder(const char* op_name) { @@ -8757,13 +8813,15 @@ void TF_DeleteAttrBuilder(TF_AttrBuilder* builder) { delete builder; } void TF_AttrBuilderSetType(TF_AttrBuilder* builder, const char* attr_name, TF_DataType value) { - builder->Set(attr_name, static_cast(value)); + auto iter = builder->attr_names.insert(attr_name).first; + builder->Set((*iter).c_str(), static_cast(value)); } void TF_AttrBuilderSetTypeList(TF_AttrBuilder* builder, const char* attr_name, const TF_DataType* values, int num_values) { + auto iter = builder->attr_names.insert(attr_name).first; builder->Set( - attr_name, + (*iter).c_str(), tensorflow::gtl::ArraySlice( reinterpret_cast(values), num_values)); } @@ -8800,3 +8858,31 @@ const char* TF_GetNumberAttrForOpListInput(const char* op_name, int input_index, // The returned string is owned by OpRegistry, so liveness is not a concern. return input_arg.number_attr().c_str(); } + +int TF_OpIsStateful(const char* op_type, TF_Status* status) { + const tensorflow::OpRegistrationData* op_reg_data; + status->status = + tensorflow::OpRegistry::Global()->LookUp(op_type, &op_reg_data); + if (!status->status.ok()) { + return 0; + } + return op_reg_data->op_def.is_stateful(); +} + +void TF_InitMain(const char* usage, int* argc, char*** argv) { + tensorflow::port::InitMain(usage, argc, argv); +} + +int TF_PickUnusedPortOrDie() { + return tensorflow::internal::PickUnusedPortOrDie(); +} + +TFE_TensorHandle* TFE_NewTensorHandleFromScalar(TF_DataType dtype_arg, + void* data, size_t len) { + auto dtype = static_cast(dtype_arg); + DCHECK(tensorflow::DataTypeCanUseMemcpy(dtype)); + + tensorflow::Tensor tensor(dtype, tensorflow::TensorShape({})); + std::memcpy(tensorflow::TensorCApi::Buffer(tensor)->data(), data, len); + return new TFE_TensorHandle(tensor, nullptr, nullptr); +} diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index 6639b0be72b..80c8bfe594c 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -180,6 +180,25 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor( TF_CAPI_EXPORT extern void TFE_TensorHandlePrintDebugString( TFE_TensorHandle* handle); +typedef struct TFE_ExecuteOpNotification TFE_ExecuteOpNotification; + +// Allows invoking a kernel asynchronously, and explicitly returns a +// notification that can be waited upon. This always executes the kernel in a +// new thread. +// 1. 
`retvals` and `num_retvals` can only be consumed after +// `TFE_ExecuteOp` returns successfully. They shouldn't be used +// if the return is unsuccessful +// 2. These new APIs cannot be used together with the TFE context level async +// support. +TF_CAPI_EXPORT extern TFE_ExecuteOpNotification* TFE_ExecuteOpInNewThread( + TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, + TF_Status* status); + +// Waits to complete the op execution, and cleans up the notification. +// Errors reported by op execution are set in `status`. +TF_CAPI_EXPORT extern void TFE_ExecuteOpNotificationWaitAndDelete( + TFE_ExecuteOpNotification* notification, TF_Status* status); + TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg); @@ -209,6 +228,24 @@ TF_CAPI_EXPORT extern void TF_AttrBuilderCheckCanRunOnDevice( TF_CAPI_EXPORT extern const char* TF_GetNumberAttrForOpListInput( const char* op_name, int input_index, TF_Status* status); +// Returns 1 if the op is stateful, 0 otherwise. The return value is undefined +// if the status is not ok. +TF_CAPI_EXPORT extern int TF_OpIsStateful(const char* op_type, + TF_Status* status); + +// Platform specific initialization routine. Very few platforms actually require +// this to be called. +TF_CAPI_EXPORT void TF_InitMain(const char* usage, int* argc, char*** argv); + +// Platform-specific implementation to return an unused port. (This should used +// in tests only.) +TF_CAPI_EXPORT int TF_PickUnusedPortOrDie(); + +// Fast path method that makes constructing a single scalar tensor require less +// overhead and copies. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromScalar( + TF_DataType dtype, void* scalar, size_t len); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc index c6effd39697..daa7701b7fe 100644 --- a/tensorflow/c/c_api_experimental_test.cc +++ b/tensorflow/c/c_api_experimental_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/c/c_api_experimental.h" #include "tensorflow/c/c_test_util.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" @@ -162,5 +164,137 @@ protocol: "grpc" TF_DeleteStatus(status); } +TEST(CAPI_EXPERIMENTAL, IsStateful) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + int assign = TF_OpIsStateful("AssignAddVariableOp", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + EXPECT_EQ(assign, 1); + int id = TF_OpIsStateful("Identity", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + EXPECT_EQ(id, 0); +} + +TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Simple) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_TensorHandle* m = TestMatrixTensorHandle(); + + TFE_Op* matmul_op = MatMulOp(ctx, m, m); + + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + + auto* r = + TFE_ExecuteOpInNewThread(matmul_op, &retvals[0], &num_retvals, status); + + TFE_ExecuteOpNotificationWaitAndDelete(r, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + float product[4] = {0}; + EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); + memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(7, product[0]); + EXPECT_EQ(10, product[1]); + EXPECT_EQ(15, product[2]); + EXPECT_EQ(22, product[3]); + + TFE_DeleteOp(matmul_op); + TFE_DeleteTensorHandle(m); + + TFE_DeleteTensorHandle(retvals[0]); + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); +} + +// Perform a send/recv test. Recv blocks, so they need to be executed +// asynchronously. +TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Blocking) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + // Returns a 2x2 float32 Tensor on the CPU, with data 1., 2., 3., 4. + TFE_TensorHandle* m = TestMatrixTensorHandle(); + + // Build a send op. + TFE_Op* send_op = TFE_NewOp(ctx, "_Send", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(send_op, m, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + string tensor_name = "Tensor"; + TFE_OpSetAttrType(send_op, "T", TF_FLOAT); + TFE_OpSetAttrString(send_op, "tensor_name", tensor_name.c_str(), + tensor_name.size()); + string send_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + TFE_OpSetAttrString(send_op, "send_device", send_device.c_str(), + send_device.size()); + TFE_OpSetAttrInt(send_op, "send_device_incarnation", 1234); + string recv_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + TFE_OpSetAttrString(send_op, "recv_device", recv_device.c_str(), + recv_device.size()); + TFE_OpSetAttrBool(send_op, "client_terminated", true); + + // Build a recv op. 
+ TFE_Op* recv_op = TFE_NewOp(ctx, "_Recv", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_OpSetAttrType(recv_op, "tensor_type", TF_FLOAT); + TFE_OpSetAttrString(recv_op, "tensor_name", tensor_name.c_str(), + tensor_name.size()); + TFE_OpSetAttrString(recv_op, "send_device", send_device.c_str(), + send_device.size()); + TFE_OpSetAttrInt(recv_op, "send_device_incarnation", 1234); + TFE_OpSetAttrString(recv_op, "recv_device", recv_device.c_str(), + recv_device.size()); + TFE_OpSetAttrBool(recv_op, "client_terminated", true); + + TFE_TensorHandle* send_retvals; + int send_num_retvals = 0; + auto* send_result = TFE_ExecuteOpInNewThread(send_op, &send_retvals, + &send_num_retvals, status); + + TFE_TensorHandle* recv_retvals[1] = {nullptr}; + int recv_num_retvals = 1; + auto* recv_result = TFE_ExecuteOpInNewThread(recv_op, &recv_retvals[0], + &recv_num_retvals, status); + + TFE_ExecuteOpNotificationWaitAndDelete(send_result, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_ExecuteOpNotificationWaitAndDelete(recv_result, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* t = TFE_TensorHandleResolve(recv_retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + float product[4] = {0}; + EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); + memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(1, product[0]); + EXPECT_EQ(2, product[1]); + EXPECT_EQ(3, product[2]); + EXPECT_EQ(4, product[3]); + + TFE_DeleteOp(send_op); + TFE_DeleteOp(recv_op); + TFE_DeleteTensorHandle(m); + + TFE_DeleteTensorHandle(recv_retvals[0]); + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc index f68f8a3e90a..28b9f8df9c8 100644 --- a/tensorflow/c/c_api_function.cc +++ b/tensorflow/c/c_api_function.cc @@ -392,26 +392,26 @@ Status ProcessInputs( EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { input_tensors->reserve(ninputs); for (int i = 0; i < ninputs; ++i) { - const Node& node = inputs[i].oper->node; + Node* node = &inputs[i].oper->node; int idx = inputs[i].index; TF_RETURN_WITH_CONTEXT_IF_ERROR( - fn_body->graph.IsValidOutputTensor(&node, idx), + fn_body->graph.IsValidOutputTensor(node, idx), "Encountered while processing input ", i, " into function '", fn_name, "'"); - TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(&node, idx), + TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(node, idx), "Encountered while processing input ", i, " into function '", fn_name, "'"); - input_tensors->emplace_back(&node, idx); + input_tensors->emplace_back(node, idx); - const auto& iter = input_nodes->find(&node); + const auto& iter = input_nodes->find(node); if (iter == input_nodes->end()) { - input_nodes->insert({&node, {idx}}); + input_nodes->insert({node, {idx}}); } else { auto& indices = iter->second; if (std::find(indices.begin(), indices.end(), idx) != indices.end()) { - return InvalidArgument("TF_Output ", node.name(), ":", idx, + return InvalidArgument("TF_Output ", node->name(), ":", idx, " appears more than once in the input list"); } indices.push_back(idx); @@ -428,16 +428,16 @@ Status ProcessOutputs(const TF_Graph* fn_body, const char* fn_name, EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { output_tensors->reserve(noutputs); for (int i = 0; i < noutputs; ++i) { - const Node& node = outputs[i].oper->node; + Node* node = &outputs[i].oper->node; int idx = 
outputs[i].index; TF_RETURN_WITH_CONTEXT_IF_ERROR( - fn_body->graph.IsValidOutputTensor(&node, idx), + fn_body->graph.IsValidOutputTensor(node, idx), "Encountered while processing output ", i, " from function '", fn_name, "'"); - TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(&node, idx), + TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(node, idx), "Encountered while creating function '", fn_name, "'"); - output_tensors->emplace_back(&node, idx); + output_tensors->emplace_back(node, idx); } return Status::OK(); } diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index ba3d8533db7..c34a84fcfee 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -50,6 +50,7 @@ tf_cuda_library( ], "//conditions:default": [], }) + [ + "@com_google_absl//absl/memory", "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/distributed_runtime/eager:eager_client", "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", @@ -143,6 +144,7 @@ tf_cuda_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 408277468d7..027d752f420 100755 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -21,9 +21,11 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/core/platform/host_info.h" #ifdef TENSORFLOW_EAGER_USE_XLA #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #endif // TENSORFLOW_EAGER_USE_XLA @@ -79,7 +81,7 @@ tensorflow::Status GetAllRemoteDevices( const std::vector& remote_workers, tensorflow::WorkerCacheInterface* worker_cache, std::unique_ptr* device_mgr) { - std::vector remote_devices; + std::vector> remote_devices; tensorflow::Status status; // TODO(nareshmodi) do this in parallel instead of serially. 
for (const string& remote_worker : remote_workers) { @@ -92,7 +94,7 @@ tensorflow::Status GetAllRemoteDevices( status = s; if (s.ok()) { for (tensorflow::Device* d : *devices) { - remote_devices.push_back(d); + remote_devices.emplace_back(d); } } n.Notify(); @@ -100,7 +102,7 @@ tensorflow::Status GetAllRemoteDevices( n.WaitForNotification(); } std::unique_ptr remote_device_mgr( - new tensorflow::DeviceMgr(remote_devices)); + new tensorflow::DeviceMgr(std::move(remote_devices))); TF_RETURN_IF_ERROR(status); @@ -261,13 +263,13 @@ TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx, void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { - std::vector devices; + std::vector> devices; status->status = tensorflow::DeviceFactory::AddDevices( opts->session_options.options, "/job:localhost/replica:0/task:0", &devices); if (!status->status.ok()) return nullptr; std::unique_ptr device_mgr( - new tensorflow::DeviceMgr(devices)); + new tensorflow::DeviceMgr(std::move(devices))); tensorflow::Rendezvous* r = new tensorflow::IntraProcessRendezvous(device_mgr.get()); @@ -409,6 +411,18 @@ const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) { : d->name().c_str(); } +const char* TFE_TensorHandleBackingDeviceName(TFE_TensorHandle* h, + TF_Status* status) { + if (h == nullptr || h->handle == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "The passed in handle is a nullptr"); + return nullptr; + } + tensorflow::Device* d = h->handle->device(); + return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0" + : d->name().c_str(); +} + TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopySharingTensor( TFE_TensorHandle* h, TF_Status* status) { if (h == nullptr || h->handle == nullptr) { @@ -458,13 +472,20 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status) { const char* name = op_or_function_name; // Shorthand const tensorflow::AttrTypeMap* types; - status->status = tensorflow::AttrTypeMapForOp(name, &types); - if (status->status.ok()) return new TFE_Op(ctx, name, types); - if (TF_GetCode(status) == TF_NOT_FOUND) { - if (ctx->context.FindFunctionByName(name)) { - status->status = tensorflow::Status::OK(); - return new TFE_Op(ctx, name, nullptr); + bool is_function = false; + status->status = tensorflow::AttrTypeMapForOp(name, &types, &is_function); + if (status->status.ok()) { + if (is_function && !ctx->context.FindFunctionByName(name)) { + status->status = tensorflow::errors::NotFound( + "'", name, + "' is neither a type of a primitive operation nor a name " + "of a function registered in binary running on ", + tensorflow::port::Hostname(), + ". Make sure the operation or function is " + "registered in the binary running in this process."); + return nullptr; } + return new TFE_Op(ctx, name, is_function, types); } return nullptr; } @@ -497,12 +518,6 @@ void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) { TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, unsigned char* is_list, TF_Status* status) { TF_AttrType ret; - if (op->operation.is_function()) { - status->status = tensorflow::errors::Unimplemented( - "TODO(apassos): Support for attributes for TensorFlow functions is not " - "ready yet."); - return TF_ATTR_INT; // The compiler requires that we return something. 
- } status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(), attr_name, &ret, is_list); return ret; diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index b2454d87220..8d6c8d958d5 100755 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -169,10 +169,33 @@ TF_CAPI_EXPORT extern int64_t TFE_TensorHandleNumElements(TFE_TensorHandle* h, TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index, TF_Status* status); + +// Returns the device of the operation that produced `h`. +// If `h` was produced by a copy, returns the destination device of +// the copy. Note that returned device name is not always the device +// holding the tensor handle's memory. If you want the latter, use +// TFE_TensorHandleBackingDeviceName. +// This function will block till the operation that produces `h` has completed. +// +// Device on which the kernel of the operation that produced `h` ran. +// +// If `h` was produced by a copy, returns the destination device of +// the copy. +// +// Note that returned device name is not always the device that owns the memory +// that backs the tensor handle. For the latter see +// TFE_TensorHandleBackingDeviceName. +// // This function will block till the operation that produces `h` has completed. TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName( TFE_TensorHandle* h, TF_Status* status); +// Returns the name of the device in whose memory `h` resides. +// +// This function will block till the operation that produces `h` has completed. +TF_CAPI_EXPORT extern const char* TFE_TensorHandleBackingDeviceName( + TFE_TensorHandle* h, TF_Status* status); + // Return a pointer to a new TFE_TensorHandle that shares the underlying tensor // with `h`. On success, `status` is set to OK. On failure, `status` reflects // the error and a nullptr is returned. diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index fa1b22e3af4..67bc1bcd246 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -93,10 +93,9 @@ struct TFE_TensorDebugInfo { }; struct TFE_Op { - // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a - // primitive operation. - TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t) - : operation(&ctx->context, op, t) {} + TFE_Op(TFE_Context* ctx, const char* op, bool is_function, + const tensorflow::AttrTypeMap* t) + : operation(&ctx->context, op, is_function, t) {} tensorflow::EagerOperation operation; }; diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 55331022b9d..6b39b79ee82 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/c/eager/c_api.h" #include +#include "absl/strings/match.h" #include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" #include "tensorflow/core/framework/function.pb.h" @@ -589,9 +590,22 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) { TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); const int num_devices = TF_DeviceListCount(devices); + bool has_gpu0 = false; + bool has_gpu1 = false; + for (int i = 0; i < num_devices; ++i) { + const char* dev = TF_DeviceListName(devices, i, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + string device_name(dev); + if (device_name.find("GPU:0") != string::npos) { + has_gpu0 = true; + } + if (device_name.find("GPU:1") != string::npos) { + has_gpu1 = true; + } + } const char* kCPUDevice = "CPU:0"; - if (num_devices < 3) { + if (!has_gpu0 || !has_gpu1) { TF_DeleteDeviceList(devices); TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); @@ -781,6 +795,14 @@ TEST(CAPI, TensorHandleNullptr) { TF_SetStatus(status.get(), TF_OK, ""); + device_name = TFE_TensorHandleBackingDeviceName(h, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(device_name, nullptr); + ASSERT_EQ("The passed in handle is a nullptr", + string(TF_Message(status.get()))); + + TF_SetStatus(status.get(), TF_OK, ""); + int num_dims = TFE_TensorHandleNumDims(h, status.get()); ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); ASSERT_EQ(num_dims, -1); @@ -796,6 +818,62 @@ TEST(CAPI, TensorHandleNullptr) { string(TF_Message(status.get()))); } +TEST(CAPI, TensorHandleDevices) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status.get()); + TFE_DeleteContextOptions(opts); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + TFE_TensorHandle* hcpu = TestMatrixTensorHandle(); + const char* device_name = TFE_TensorHandleDeviceName(hcpu, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_TRUE(absl::StrContains(device_name, "CPU:0")) << device_name; + const char* backing_device_name = + TFE_TensorHandleBackingDeviceName(hcpu, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_TRUE(absl::StrContains(backing_device_name, "CPU:0")) + << backing_device_name; + + // Disable the test if no GPU is present. 
+ string gpu_device_name; + if (GetDeviceName(ctx, &gpu_device_name, "GPU")) { + TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice( + hcpu, ctx, gpu_device_name.c_str(), status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + TFE_Op* shape_op = ShapeOp(ctx, hgpu); + TFE_OpSetDevice(shape_op, gpu_device_name.c_str(), status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(shape_op, &retvals[0], &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + // .device of shape is GPU since the op is executed on GPU + device_name = TFE_TensorHandleDeviceName(retvals[0], status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_TRUE(absl::StrContains(device_name, "GPU:0")) << device_name; + + // .backing_device of shape is CPU since the tensor is backed by CPU + backing_device_name = + TFE_TensorHandleBackingDeviceName(retvals[0], status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_TRUE(absl::StrContains(backing_device_name, "CPU:0")) + << backing_device_name; + + TFE_DeleteOp(shape_op); + TFE_DeleteTensorHandle(retvals[0]); + TFE_DeleteTensorHandle(hgpu); + } + + TFE_DeleteTensorHandle(hcpu); + TFE_ContextAsyncWait(ctx, status.get()); + EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContext(ctx); +} + void Execute_MatMul_CPU(bool async) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index 008f088c2dc..bd38127d50c 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -104,6 +104,19 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { return op; } +TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a) { + TF_Status* status = TF_NewStatus(); + + TFE_Op* op = TFE_NewOp(ctx, "Shape", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, a, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); + TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(a)); + + return op; +} + TFE_TensorHandle* TestAxisTensorHandle() { int64_t dims[] = {1}; int data[] = {1}; diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index 474cae67c89..75ef9459e93 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -37,6 +37,9 @@ TFE_TensorHandle* TestMatrixTensorHandle3X2(); // Return a matmul op multiplying `a` by `b`. TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); +// Return a shape op fetching the shape of `a`. +TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a); + // Return an 1-D INT32 tensor containing a single value 1. TFE_TensorHandle* TestAxisTensorHandle(); diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 5ba55a203ff..5c11f51e874 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -141,8 +141,9 @@ class GradientTape { // null. The result is populated with one tensor per target element. 
Status ComputeGradient( const VSpace& vspace, - gtl::ArraySlice target_tensor_ids, - gtl::ArraySlice source_tensor_id, + const gtl::ArraySlice target_tensor_ids, + const gtl::ArraySlice source_tensor_ids, + const gtl::FlatMap sources_that_are_targets, gtl::ArraySlice output_gradients, std::vector* result); @@ -396,6 +397,7 @@ template Status InitialGradients( const VSpace& vspace, gtl::ArraySlice target_tensor_ids, + gtl::FlatMap sources_that_are_targets, gtl::ArraySlice output_gradients, const TensorTape& tensor_tape, const OpTape& op_tape, gtl::FlatMap>* result) { @@ -425,8 +427,13 @@ Status InitialGradients( "none of operations outputs match expected tensor"); } } else { - // No record of the target tensor found on the tape, so no gradient - // needs to be computed from it. Do nothing. + // This target tensor was not generated by any operation recorded on + // the tape, so no gradient needs to be computed from it unless this + // target is also a source. + auto source_tensor = sources_that_are_targets.find(id); + if (source_tensor != sources_that_are_targets.end()) { + (*result)[id].push_back(vspace.Ones(source_tensor->second)); + } } } else { (*result)[id].push_back(output_gradients[i]); @@ -467,8 +474,9 @@ constexpr int kMinAggregateBytes = 128 * 1024 * 1024; template Status GradientTape::ComputeGradient( const VSpace& vspace, - gtl::ArraySlice target_tensor_ids, - gtl::ArraySlice source_tensor_ids, + const gtl::ArraySlice target_tensor_ids, + const gtl::ArraySlice source_tensor_ids, + const gtl::FlatMap sources_that_are_targets, gtl::ArraySlice output_gradients, std::vector* result) { gtl::FlatSet sources_set(source_tensor_ids.begin(), @@ -478,7 +486,8 @@ Status GradientTape::ComputeGradient( std::vector op_stack = InitialStack(state.op_tape, state.op_missing_tensor); gtl::FlatMap> gradients; - Status s = InitialGradients(vspace, target_tensor_ids, output_gradients, + Status s = InitialGradients(vspace, target_tensor_ids, + sources_that_are_targets, output_gradients, tensor_tape_, state.op_tape, &gradients); auto cleanup = [this, &state]() { if (!persistent_) { diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc new file mode 100644 index 00000000000..3caa5bcb038 --- /dev/null +++ b/tensorflow/c/kernels.cc @@ -0,0 +1,143 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/kernels.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" + +// This file forms the basis of a stable ABI for third-party kernel +// implementations. It is crucial that changes to this file are made cautiously +// and with a focus on maintaining both source and binary compatibility. 
+ +struct TF_KernelBuilder { + ::tensorflow::KernelDefBuilder* cc_builder; + + void* (*create_function)(TF_OpKernelConstruction*); + void (*compute_function)(void*, TF_OpKernelContext*); + void (*delete_function)(void*); +}; + +TF_KernelBuilder* TF_NewKernelBuilder( + const char* op_name, const char* device_name, + void* (*create_func)(TF_OpKernelConstruction*), + void (*compute_func)(void*, TF_OpKernelContext*), + void (*delete_func)(void*)) { + TF_KernelBuilder* result = new TF_KernelBuilder; + result->cc_builder = new ::tensorflow::KernelDefBuilder(op_name); + result->cc_builder->Device(device_name); + result->create_function = create_func; + result->compute_function = compute_func; + result->delete_function = delete_func; + return result; +} + +void TF_DeleteKernelBuilder(TF_KernelBuilder* builder) { + DCHECK_NE(builder, nullptr); + delete builder->cc_builder; + delete builder; +} + +namespace tensorflow { +namespace { + +// An OpKernel whose methods delegate to C function pointers. +class COpKernel : public OpKernel { + public: + explicit COpKernel(OpKernelConstruction* ctx, + void* (*create_func)(TF_OpKernelConstruction*), + void (*compute_func)(void*, TF_OpKernelContext*), + void (*delete_func)(void*)) + : OpKernel(ctx), compute_func_(compute_func), delete_func_(delete_func) { + if (create_func != nullptr) { + c_kernel_ = + (*create_func)(reinterpret_cast(ctx)); + } else { + c_kernel_ = nullptr; + } + } + + void Compute(OpKernelContext* ctx) override { + (*compute_func_)(c_kernel_, reinterpret_cast(ctx)); + } + + ~COpKernel() override { + if (delete_func_ != nullptr) { + (*delete_func_)(c_kernel_); + } + } + + private: + void (*compute_func_)(void*, TF_OpKernelContext* context); + void (*delete_func_)(void*); + void* c_kernel_; +}; + +// A KernelFactory that returns COpKernel instances. 
+class KernelBuilderFactory + : public ::tensorflow::kernel_factory::OpKernelFactory { + public: + explicit KernelBuilderFactory(TF_KernelBuilder* builder) + : builder_(builder) {} + ::tensorflow::OpKernel* Create( + ::tensorflow::OpKernelConstruction* context) override { + return new ::tensorflow::COpKernel(context, builder_->create_function, + builder_->compute_function, + builder_->delete_function); + } + ~KernelBuilderFactory() override { TF_DeleteKernelBuilder(builder_); } + + private: + TF_KernelBuilder* builder_; +}; +} // namespace +} // namespace tensorflow + +void TF_RegisterKernelBuilder(const char* name, TF_KernelBuilder* builder, + TF_Status* status) { + using tensorflow::register_kernel::Name; + + tensorflow::kernel_factory::OpKernelRegistrar( + builder->cc_builder->Build(), name, + absl::make_unique(builder)); + + TF_SetStatus(status, TF_OK, ""); +} + +int TF_NumInputs(TF_OpKernelContext* ctx) { + auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx); + return cc_ctx->num_inputs(); +} + +int TF_NumOutputs(TF_OpKernelContext* ctx) { + auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx); + return cc_ctx->num_outputs(); +} + +void TF_GetInput(TF_OpKernelContext* ctx, int i, TF_Tensor** tensor, + TF_Status* status) { + auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx); + if (i < 0 || i >= cc_ctx->num_inputs()) { + TF_SetStatus(status, TF_OUT_OF_RANGE, "input index out of range"); + return; + } + const ::tensorflow::Tensor& cc_tensor(cc_ctx->input(i)); + TF_Tensor* result = ::tensorflow::TF_TensorFromTensor(cc_tensor, status); + if (TF_GetCode(status) == TF_OK) { + *tensor = result; + } +} diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h new file mode 100644 index 00000000000..d7778829bca --- /dev/null +++ b/tensorflow/c/kernels.h @@ -0,0 +1,110 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_KERNELS_H_ +#define TENSORFLOW_C_KERNELS_H_ + +#include "tensorflow/c/c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// -------------------------------------------------------------------------- +// C API for TensorFlow Kernels. +// +// This API allows developers to register custom kernel implementations for +// TensorFlow. +// +// See c_api.h header comments for a discussion about API conventions. +// +// Users wishing to extend TensorFlow with new kernels will call +// `TF_NewKernelBuilder`. The resulting kernel builder can be registered with +// `TF_RegisterKernelBuilder`, which will allow TF to construct user-provided +// kernels when necessary. + +struct TF_KernelBuilder; +struct TF_OpKernelConstruction; +struct TF_OpKernelContext; + +// Allocates a new kernel builder and returns a pointer to it. +// +// If non-null, TensorFlow will call create_func when it needs to instantiate +// the kernel. 
The pointer returned by create_func will be passed to +// compute_func and delete_func, thereby functioning as a "this" pointer for +// referring to kernel instances. +// +// The TF_OpKernelConstruction pointer passed to create_func is owned by +// TensorFlow and will be deleted once create_func returns. It must not be used +// after this. +// +// When TensorFlow needs to perform a computation with this kernel, it will +// call compute_func. This function will receive the pointer returned by +// create_func (or null if no create_func was provided), along with the inputs +// to the computation. +// +// The TF_OpKernelContext pointer received by compute_func is owned by +// TensorFlow and will be deleted once compute_func returns. It must not be used +// after this. +// +// Finally, when TensorFlow no longer needs the kernel, it will call +// delete_func if one is provided. This function will receive the pointer +// returned in `create_func` or nullptr if no `create_func` was provided. +// +// The caller should pass the result of this function to +// TF_RegisterKernelBuilder, which will take ownership of the pointer. If, for +// some reason, the kernel builder will not be registered, the caller should +// delete it with TF_DeleteKernelBuilder. +TF_CAPI_EXPORT extern TF_KernelBuilder* TF_NewKernelBuilder( + const char* op_name, const char* device_name, + void* (*create_func)(TF_OpKernelConstruction*), + void (*compute_func)(void*, TF_OpKernelContext*), + void (*delete_func)(void*)); + +// Register the given kernel builder with the TensorFlow runtime. If +// registration fails, the given status will be populated. +// +// This call takes ownership of the `builder` pointer. +TF_CAPI_EXPORT extern void TF_RegisterKernelBuilder(const char* kernel_name, + TF_KernelBuilder* builder, + TF_Status* status); + +// Deletes the given TF_KernelBuilder. This should be called only if the kernel +// builder is not registered with TensorFlow via TF_RegisterKernelBuilder. +TF_CAPI_EXPORT extern void TF_DeleteKernelBuilder(TF_KernelBuilder* builder); + +// -------------------------------------------------------------------------- +// OpKernelContext routines + +// TF_NumInputs returns the number of inputs available in ctx. +TF_CAPI_EXPORT extern int TF_NumInputs(TF_OpKernelContext* ctx); + +// TF_NumOutputs returns the number of outputs to be placed in *ctx by the +// kernel. +TF_CAPI_EXPORT extern int TF_NumOutputs(TF_OpKernelContext* ctx); + +// Retrieves the ith input from ctx. If TF_GetCode(status) is TF_OK, *tensor is +// populated and its ownership is passed to the caller. In any other case, +// *tensor is not modified. +// +// If i < 0 or i >= TF_NumInputs(ctx), *status is set to TF_OUT_OF_RANGE. +TF_CAPI_EXPORT extern void TF_GetInput(TF_OpKernelContext* ctx, int i, + TF_Tensor** tensor, TF_Status* status); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_KERNELS_H_ diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc new file mode 100644 index 00000000000..80bf12c0969 --- /dev/null +++ b/tensorflow/c/kernels_test.cc @@ -0,0 +1,194 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/kernels.h"
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/node_def.pb_text.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+struct MyCustomKernel {
+  bool created;
+  bool compute_called;
+};
+
+static bool delete_called = false;
+
+static void* MyCreateFunc(TF_OpKernelConstruction* ctx) {
+  struct MyCustomKernel* s = new struct MyCustomKernel;
+  s->created = true;
+  s->compute_called = false;
+  return s;
+}
+
+static void MyComputeFunc(void* kernel, TF_OpKernelContext* ctx) {
+  struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
+  s->compute_called = true;
+}
+
+static void MyDeleteFunc(void* kernel) {
+  struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
+  EXPECT_TRUE(s->created);
+  EXPECT_TRUE(s->compute_called);
+  delete_called = true;
+  delete s;
+}
+
+namespace tensorflow {
+
+static std::unique_ptr<OpKernel> GetFakeKernel(const char* device_name,
+                                               const char* op_name,
+                                               Status* status) {
+  NodeDef def;
+  def.set_op(op_name);
+  def.set_device(device_name);
+  def.add_input("input1");
+  def.add_input("input2");
+  return CreateOpKernel(DeviceType(device_name), nullptr, nullptr, def, 1,
+                        status);
+}
+
+// Tests registration of a single C kernel and checks that calls through the
+// C/C++ boundary are being made.
+TEST(TestKernel, TestRegisterKernelBuilder) { + const char* kernel_name = "SomeKernelName"; + const char* op_name = "FooOp"; + const char* device_name = "FakeDeviceName1"; + + REGISTER_OP(op_name) + .Input("input1: double") + .Input("input2: uint8") + .Output("output1: uint8"); + + TF_KernelBuilder* builder = TF_NewKernelBuilder( + op_name, device_name, &MyCreateFunc, &MyComputeFunc, &MyDeleteFunc); + + { + TF_Status* status = TF_NewStatus(); + TF_RegisterKernelBuilder(kernel_name, builder, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)); + TF_Buffer* buf = TF_GetRegisteredKernelsForOp(op_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)); + KernelList list; + list.ParseFromArray(buf->data, buf->length); + ASSERT_EQ(1, list.kernel_size()); + ASSERT_EQ(device_name, list.kernel(0).device_type()); + TF_DeleteBuffer(buf); + TF_DeleteStatus(status); + } + + { + Status status; + std::unique_ptr kernel = + GetFakeKernel(device_name, op_name, &status); + TF_EXPECT_OK(status); + ASSERT_NE(nullptr, kernel.get()); + kernel->Compute(nullptr); + } + + ASSERT_TRUE(delete_called); +} + +class DummyDevice : public DeviceBase { + public: + DummyDevice(Env* env, bool save) : DeviceBase(env), save_(save) {} + bool RequiresRecordingAccessedTensors() const override { return save_; } + Allocator* GetAllocator(AllocatorAttributes /*attr*/) override { + return cpu_allocator(); + } + + private: + bool save_; +}; + +TEST(TestKernel, TestInputAndOutputCount) { + const char* kernel_name = "InputOutputCounterKernel"; + const char* op_name = "BarOp"; + const char* device_name = "FakeDeviceName2"; + + REGISTER_OP(op_name) + .Input("input1: double") + .Input("input2: uint8") + .Output("output1: uint8"); + + static int num_inputs = 0; + static int num_outputs = 0; + + // A kernel whose Compute function has a side-effect of updating num_inputs + // and num_outputs. Various functions on TF_OpKernelContext are also + // exercised. 
+ auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) { + num_inputs = TF_NumInputs(ctx); + num_outputs = TF_NumOutputs(ctx); + + TF_Tensor* input = nullptr; + TF_Status* s = TF_NewStatus(); + TF_GetInput(ctx, 0, &input, s); + EXPECT_EQ(TF_OK, TF_GetCode(s)) << "Failed to get input: " << TF_Message(s); + EXPECT_EQ(123, *static_cast(TF_TensorData(input))); + TF_GetInput(ctx, -1, &input, s); + EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s)); + TF_GetInput(ctx, 3, &input, s); + EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s)); + TF_DeleteStatus(s); + if (input != nullptr) { + TF_DeleteTensor(input); + } + }; + + TF_KernelBuilder* builder = TF_NewKernelBuilder(op_name, device_name, nullptr, + my_compute_func, nullptr); + + { + TF_Status* status = TF_NewStatus(); + TF_RegisterKernelBuilder(kernel_name, builder, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)); + TF_DeleteStatus(status); + } + + { + OpKernelContext::Params p; + DummyDevice dummy_device(nullptr, false); + p.device = &dummy_device; + + Tensor t(tensorflow::uint8(123)); + + gtl::InlinedVector inputs; + // Simulate 2 inputs + inputs.emplace_back(&t); + inputs.emplace_back(); + p.inputs = &inputs; + + Status status; + std::unique_ptr kernel = + GetFakeKernel(device_name, op_name, &status); + TF_EXPECT_OK(status); + ASSERT_NE(nullptr, kernel.get()); + + p.op_kernel = kernel.get(); + OpKernelContext ctx(&p); + kernel->Compute(&ctx); + + ASSERT_EQ(2, num_inputs); + ASSERT_EQ(1, num_outputs); + } +} + +} // namespace tensorflow diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc index 247236b760d..98d83933322 100644 --- a/tensorflow/c/python_api.cc +++ b/tensorflow/c/python_api.cc @@ -160,4 +160,17 @@ void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto, ic->set_output_handle_shapes_and_types(output.index, shapes_and_types); } +void AddWhileInputHack(TF_Graph* graph, TF_Output new_src, TF_Operation* dst, + TF_Status* status) { + mutex_lock l(graph->mu); + status->status = graph->graph.AddWhileInputHack(&new_src.oper->node, + new_src.index, &dst->node); + if (status->status.ok()) { + // This modification only updates the destination node for + // the purposes of running this graph in a session. Thus, we don't + // record the source node as being modified. + RecordMutation(graph, *dst, "adding input tensor"); + } +} + } // namespace tensorflow diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h index 5cce84020bc..44779ca6561 100644 --- a/tensorflow/c/python_api.h +++ b/tensorflow/c/python_api.h @@ -34,6 +34,7 @@ void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name, void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device); +// Updates 'dst' to consume 'new_src'. void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst, TF_Status* status); @@ -65,6 +66,13 @@ std::string GetHandleShapeAndType(TF_Graph* graph, TF_Output output); // because I couldn't get SWIG to work otherwise. void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto, size_t proto_len, TF_Status* status); + +// This method is used to add a new input edge to 'dst', which must be a While +// op. The While op's "T" attribute must have already been updated to include +// the new edge. This is used to construct tf.while_loop gradients. 
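The compute callback above leans on the `TF_GetInput` contract spelled out in kernels.h: on success, ownership of `*tensor` passes to the caller; for an out-of-range index only the status is touched. A small helper capturing that contract (the helper itself is ours, not part of the API):

```c++
#include "tensorflow/c/kernels.h"

// Returns the i-th input, or nullptr if it is unavailable. The caller owns
// (and must TF_DeleteTensor) any non-null result.
static TF_Tensor* GetInputOrNull(TF_OpKernelContext* ctx, int i) {
  TF_Tensor* tensor = nullptr;
  TF_Status* status = TF_NewStatus();
  TF_GetInput(ctx, i, &tensor, status);
  if (TF_GetCode(status) != TF_OK) {
    // e.g. TF_OUT_OF_RANGE when i < 0 or i >= TF_NumInputs(ctx); *tensor was
    // left untouched, so it is still nullptr here.
    tensor = nullptr;
  }
  TF_DeleteStatus(status);
  return tensor;
}
```

This mirrors what the test lambda does by hand, including deleting the tensor it received.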
+void AddWhileInputHack(TF_Graph* graph, TF_Output new_src, TF_Operation* dst, + TF_Status* status); + } // namespace tensorflow #endif // TENSORFLOW_C_PYTHON_API_H_ diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 83353b79f72..a09becc49b1 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -489,6 +489,7 @@ tf_gen_op_wrappers_cc( "image_ops", "io_ops", "linalg_ops", + "list_ops", "logging_ops", "lookup_ops", "manip_ops", diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 3d3895c8fa8..52345a376cc 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -133,5 +133,6 @@ filegroup( "testdata/half_plus_two_pbtxt/**", "testdata/half_plus_two_main_op/**", "testdata/half_plus_two/**", + "testdata/half_plus_two_v2/**", ]), ) diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h index 645a3f101d1..6f00dc324bd 100644 --- a/tensorflow/cc/saved_model/constants.h +++ b/tensorflow/cc/saved_model/constants.h @@ -33,10 +33,10 @@ constexpr char kSavedModelFilenamePb[] = "saved_model.pb"; /// SavedModel text format proto filename. constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt"; -/// SavedModel legacy init op key. +/// SavedModel legacy init op collection key. Used in v1 SavedModels. constexpr char kSavedModelLegacyInitOpKey[] = "legacy_init_op"; -/// SavedModel main op key. +/// SavedModel main op collection key. Used in v1 SavedModels. constexpr char kSavedModelMainOpKey[] = "saved_model_main_op"; /// Directory in which to save the SavedModel variables. @@ -45,6 +45,11 @@ constexpr char kSavedModelVariablesDirectory[] = "variables"; /// SavedModel variables filename. constexpr char kSavedModelVariablesFilename[] = "variables"; +/// SavedModel SignatureDef keys for the initialization and train ops. Used in +/// V2 SavedModels. +constexpr char kSavedModelInitOpSignatureKey[] = "__saved_model_init_op"; +constexpr char kSavedModelTrainOpSignatureKey[] = "__saved_model_train_op"; + } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_ diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index c6abe2f41b9..85d3dd01fa5 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -122,38 +122,58 @@ Status RunOnce(const RunOptions& run_options, return run_status; } -bool HasMainOp(const MetaGraphDef& meta_graph_def) { - const auto& collection_def_map = meta_graph_def.collection_def(); - if (collection_def_map.find(kSavedModelMainOpKey) != - collection_def_map.end()) { - return true; - } - return false; -} - -Status RunMainOp(const RunOptions& run_options, const string& export_dir, +// RunInitOp will return OK if the initialization op was run successfully. +// An empty init_op_name indicates that there are no init ops to run. 
+Status RunInitOp(const RunOptions& run_options, const string& export_dir, const MetaGraphDef& meta_graph_def, const std::vector& asset_file_defs, - Session* session, const string& main_op_key) { - LOG(INFO) << "Running MainOp with key " << main_op_key - << " on SavedModel bundle."; - const auto& collection_def_map = meta_graph_def.collection_def(); - const auto main_op_it = collection_def_map.find(main_op_key); - if (main_op_it != collection_def_map.end()) { - if (main_op_it->second.node_list().value_size() != 1) { - return errors::FailedPrecondition( - strings::StrCat("Expected exactly one main op in : ", export_dir)); - } + Session* session, const string& init_op_name) { + if (!init_op_name.empty()) { + LOG(INFO) << "Running initialization op on SavedModel bundle."; std::vector> inputs; AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); RunMetadata run_metadata; - const StringPiece main_op_name = main_op_it->second.node_list().value(0); - return RunOnce(run_options, inputs, {}, {string(main_op_name)}, + return RunOnce(run_options, inputs, {}, {init_op_name}, nullptr /* outputs */, &run_metadata, session); } return Status::OK(); } +// A SavedModel may store the name of the initialization op to run in the +// in the SignatureDef (v2) or a collection (v1). If an init_op collection +// exists, then the collection must contain exactly one op. +Status GetInitOp(const string& export_dir, const MetaGraphDef& meta_graph_def, + string* init_op_name) { + const auto& sig_def_map = meta_graph_def.signature_def(); + const auto& init_op_sig_it = + meta_graph_def.signature_def().find(kSavedModelInitOpSignatureKey); + if (init_op_sig_it != sig_def_map.end()) { + *init_op_name = init_op_sig_it->second.outputs() + .find(kSavedModelInitOpSignatureKey) + ->second.name(); + return Status::OK(); + } + + const auto& collection_def_map = meta_graph_def.collection_def(); + string init_op_collection_key; + if (collection_def_map.find(kSavedModelMainOpKey) != + collection_def_map.end()) { + init_op_collection_key = kSavedModelMainOpKey; + } else { + init_op_collection_key = kSavedModelLegacyInitOpKey; + } + + const auto init_op_it = collection_def_map.find(init_op_collection_key); + if (init_op_it != collection_def_map.end()) { + if (init_op_it->second.node_list().value_size() != 1) { + return errors::FailedPrecondition( + strings::StrCat("Expected exactly one main op in : ", export_dir)); + } + *init_op_name = init_op_it->second.node_list().value(0); + } + return Status::OK(); +} + Status RunRestore(const RunOptions& run_options, const string& export_dir, const StringPiece restore_op_name, const StringPiece variable_filename_const_op_name, @@ -193,6 +213,15 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, std::vector* asset_file_defs) { + // With SavedModel v2, we write asset file def into metagraph instead of + // collection, so read from metagraph first. + if (meta_graph_def.asset_file_def_size() > 0) { + for (const auto& asset : meta_graph_def.asset_file_def()) { + asset_file_defs->push_back(asset); + } + return Status::OK(); + } + // Fall back to read from collection to be backward compatible with v1. 
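For context, the v2 test data added below is loaded through exactly this path: `LoadSavedModel` runs the restore op and then whatever init op `GetInitOp` resolves, preferring the SignatureDef key over the v1 collections. A minimal, hedged sketch of the client side (the export directory is hypothetical; `kSavedModelTagServe` comes from tag_constants.h):

```c++
#include <string>

#include "tensorflow/cc/saved_model/loader.h"
#include "tensorflow/cc/saved_model/tag_constants.h"

tensorflow::Status LoadV2Bundle(const std::string& export_dir,
                                tensorflow::SavedModelBundle* bundle) {
  tensorflow::SessionOptions session_options;
  tensorflow::RunOptions run_options;
  // The restore op and the resolved init op are run before this returns.
  return tensorflow::LoadSavedModel(session_options, run_options, export_dir,
                                    {tensorflow::kSavedModelTagServe}, bundle);
}
```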
const auto& collection_def_map = meta_graph_def.collection_def(); const auto assets_it = collection_def_map.find(kSavedModelAssetsKey); if (assets_it == collection_def_map.end()) { @@ -227,15 +256,12 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, bundle->meta_graph_def.saver_def().restore_op_name(), bundle->meta_graph_def.saver_def().filename_tensor_name(), asset_file_defs, bundle->session.get())); - if (HasMainOp(bundle->meta_graph_def)) { - TF_RETURN_IF_ERROR(RunMainOp(run_options, export_dir, - bundle->meta_graph_def, asset_file_defs, - bundle->session.get(), kSavedModelMainOpKey)); - } else { - TF_RETURN_IF_ERROR(RunMainOp( - run_options, export_dir, bundle->meta_graph_def, asset_file_defs, - bundle->session.get(), kSavedModelLegacyInitOpKey)); - } + string init_op_name; + TF_RETURN_IF_ERROR( + GetInitOp(export_dir, bundle->meta_graph_def, &init_op_name)); + TF_RETURN_IF_ERROR(RunInitOp(run_options, export_dir, bundle->meta_graph_def, + asset_file_defs, bundle->session.get(), + init_op_name)); return Status::OK(); } diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc index 72b8bc18710..597e42bb65a 100644 --- a/tensorflow/cc/saved_model/loader_test.cc +++ b/tensorflow/cc/saved_model/loader_test.cc @@ -36,6 +36,8 @@ constexpr char kTestDataMainOp[] = "cc/saved_model/testdata/half_plus_two_main_op/00000123"; constexpr char kTestDataSharded[] = "cc/saved_model/testdata/half_plus_two/00000123"; +constexpr char kTestDataInitOpV2[] = + "cc/saved_model/testdata/half_plus_two_v2/00000123"; class LoaderTest : public ::testing::Test { protected: @@ -227,5 +229,17 @@ TEST_F(LoaderTest, MaybeSavedModelDirectory) { EXPECT_FALSE(MaybeSavedModelDirectory(invalid_export_dir)); } +TEST_F(LoaderTest, SavedModelInitOpV2Format) { + SavedModelBundle bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataInitOpV2); + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + CheckSavedModelBundle(export_dir, bundle); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/assets/foo.txt b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/assets/foo.txt new file mode 100644 index 00000000000..f9ff0366880 --- /dev/null +++ b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/assets/foo.txt @@ -0,0 +1 @@ +asset-file-contents \ No newline at end of file diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/saved_model.pb b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/saved_model.pb new file mode 100644 index 00000000000..a10bbf8fb6b Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/saved_model.pb differ diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..15b75d6ef6b Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.data-00000-of-00001 differ diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.index b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.index new file mode 100644 index 00000000000..7ec9fb4fe2d 
Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.index differ diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index b17bc658fa0..ab1c1be344e 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -164,7 +164,8 @@ string RewriteWithName(const string& name, string code, } // Generate methods for args (inputs). -Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps, +Status GenArgMethods(const tf2xla::Config& config, + const xla::ProgramShapeProto& ps, const CompileResult& compile_result, string* methods) { size_t num_args = ps.parameters_size(); if (config.feed_size() != num_args) { @@ -174,9 +175,10 @@ Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps, } for (int i = 0; i < num_args; ++i) { std::vector> rewrites; - TF_RETURN_IF_ERROR(AddRewritesForShape(i, ps.parameters(i), &rewrites)); + TF_RETURN_IF_ERROR( + AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites)); const string code = R"( - void set_arg{{NAME}}_data(void* data) { + void set_arg{{NAME}}_data(const void* data) { set_arg_data({{I}}, data); } {{TYPE}}* arg{{NAME}}_data() { @@ -204,7 +206,7 @@ Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps, // Generate methods for results (outputs). Status GenResultMethods(const tf2xla::Config& config, - const xla::ProgramShape& ps, string* methods) { + const xla::ProgramShapeProto& ps, string* methods) { if (ps.result().element_type() != xla::TUPLE) { // The XlaCompiler we use to build the xla computation always generates a // tuple result, and we rely on this to simplify code generation. @@ -217,8 +219,8 @@ Status GenResultMethods(const tf2xla::Config& config, } for (int i = 0; i < ps.result().tuple_shapes_size(); ++i) { std::vector> rewrites; - TF_RETURN_IF_ERROR( - AddRewritesForShape(i, ps.result().tuple_shapes(i), &rewrites)); + TF_RETURN_IF_ERROR(AddRewritesForShape( + i, xla::Shape(ps.result().tuple_shapes(i)), &rewrites)); string code = R"( {{TYPE}}* result{{NAME}}_data() { return static_cast<{{TYPE}}*>(result_data({{I}})); @@ -336,7 +338,7 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, ExtractEntryParamBufferInfos(buffer_infos); std::vector buffer_infos_for_temps = ExtractTempBufferInfos(buffer_infos); - const xla::ProgramShape& ps = compile_result.program_shape; + const xla::ProgramShapeProto& ps = compile_result.program_shape; string methods_arg, methods_result; TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg)); TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result)); @@ -548,8 +550,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { static const char** StaticResultNames() {{RESULT_NAMES_CODE}} // Shape of the args and results. 
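Throughout these codegen changes the program shape is kept in proto form and only rehydrated where the XLA utilities need real shape objects. A small sketch of that round trip, assuming the `ToProto()` conversion and the proto-taking `xla::Shape` / `xla::ProgramShape` constructors this diff relies on (header paths are indicative):

```c++
#include <string>

#include "tensorflow/compiler/xla/shape.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Build a ProgramShape, keep only its proto, then rebuild shapes on demand,
// mirroring what GenArgMethods / GenResultMethods now do.
xla::ProgramShapeProto MakeExampleProgramShapeProto() {
  xla::ProgramShape program_shape = xla::ShapeUtil::MakeProgramShape(
      {xla::ShapeUtil::MakeShape(xla::F32, {1, 2})},
      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}));
  return program_shape.ToProto();
}

void UseProgramShapeProto(const xla::ProgramShapeProto& ps) {
  // Rehydrate a single parameter shape, as the generated arg methods do.
  xla::Shape arg0(ps.parameters(0));
  // Or rebuild the whole ProgramShape, e.g. for a human-readable dump.
  std::string readable = xla::ShapeUtil::HumanString(xla::ProgramShape(ps));
  (void)arg0;
  (void)readable;
}
```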
- static const xla::ProgramShape* StaticProgramShape() { - static const xla::ProgramShape* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}}; + static const xla::ProgramShapeProto* StaticProgramShape() { + static const xla::ProgramShapeProto* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}}; return kShape; } @@ -587,7 +589,7 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { {"{{METHODS_RESULT}}\n", methods_result}, {"{{NS_END}}\n", ns_end}, {"{{NS_START}}\n", ns_start}, - {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(ps)}, + {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(xla::ProgramShape(ps))}, {"{{PROGRAM_SHAPE_SHIM_EXPRESSION}}", metadata_result.program_shape_access_shim}, {"{{RESULT_INDEX}}", absl::StrCat(result_index)}, @@ -615,11 +617,11 @@ static string CreateUniqueIdentifier(const CodegenOpts& opts, Status GenerateMetadata(const CodegenOpts& opts, const CompileResult& compile_result, MetadataResult* metadata_result) { - std::unique_ptr program_shape; + std::unique_ptr program_shape; if (opts.gen_program_shape) { program_shape = - absl::make_unique(compile_result.program_shape); + absl::make_unique(compile_result.program_shape); // The parameter names are currently meaningless, and redundant with the // rest of our metadata, so clear them out to avoid confusion and save @@ -631,8 +633,8 @@ Status GenerateMetadata(const CodegenOpts& opts, // a shim that evaluates to nullptr, which is what we want. ProtobufToEmbed program_shape_protobuf{ - CreateUniqueIdentifier(opts, "ProgramShape"), "xla::ProgramShape", - program_shape.get()}; + CreateUniqueIdentifier(opts, "ProgramShapeProto"), + "xla::ProgramShapeProto", program_shape.get()}; ProtobufToEmbed hlo_profile_printer_data_protobuf{ CreateUniqueIdentifier(opts, "HloProfilePrinterData"), diff --git a/tensorflow/compiler/aot/codegen.h b/tensorflow/compiler/aot/codegen.h index 90410c46a8e..9485e86b10e 100644 --- a/tensorflow/compiler/aot/codegen.h +++ b/tensorflow/compiler/aot/codegen.h @@ -57,7 +57,7 @@ struct MetadataResult { std::vector header_variable_decls; // program_shape_access_shim is a C++ expression that constructs the - // xla::ProgramShape instance for the CompileResult passed to + // xla::ProgramShapeProto instance for the CompileResult passed to // GenerateMetadata. 
string program_shape_access_shim; diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index bb288d23000..c1788ca32a1 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -181,13 +181,15 @@ TEST(CodegenTest, Golden) { BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/1), BufferInfo::MakeTempBuffer(3), BufferInfo::MakeTempBuffer(120)}, 5, {})); - compile_result.program_shape = xla::ShapeUtil::MakeProgramShape( - { - xla::ShapeUtil::MakeShape(xla::F32, {1, 2}), - xla::ShapeUtil::MakeShape(xla::S64, {3, 4}), - }, - xla::ShapeUtil::MakeTupleShape( - {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})})); + compile_result.program_shape = + xla::ShapeUtil::MakeProgramShape( + { + xla::ShapeUtil::MakeShape(xla::F32, {1, 2}), + xla::ShapeUtil::MakeShape(xla::S64, {3, 4}), + }, + xla::ShapeUtil::MakeTupleShape( + {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})})) + .ToProto(); compile_result.entry_point = "entry_point"; compile_result.pointer_size = 8; diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index e4d8a02877c..968afad65ed 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -22,7 +22,7 @@ extern "C" void entry_point( void* result, const xla::ExecutableRunOptions* run_options, const void** args, void** temps, tensorflow::int64* profile_counters); -extern "C" char __tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[]; +extern "C" char __tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[]; namespace foo { @@ -114,7 +114,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { // with dim indices specifying which value. No bounds checking is performed // on dim indices. - void set_arg0_data(void* data) { + void set_arg0_data(const void* data) { set_arg_data(0, data); } float* arg0_data() { @@ -132,7 +132,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { arg_data(0)))[dim0][dim1]; } - void set_arg_myfeed_data(void* data) { + void set_arg_myfeed_data(const void* data) { set_arg_data(0, data); } float* arg_myfeed_data() { @@ -150,7 +150,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { arg_data(0)))[dim0][dim1]; } - void set_arg1_data(void* data) { + void set_arg1_data(const void* data) { set_arg_data(1, data); } tensorflow::int64* arg1_data() { @@ -253,10 +253,10 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { } // Shape of the args and results. 
- static const xla::ProgramShape* StaticProgramShape() { - static const xla::ProgramShape* kShape = []() { - xla::ProgramShape* proto = new xla::ProgramShape; - proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[0], 52); + static const xla::ProgramShapeProto* StaticProgramShape() { + static const xla::ProgramShapeProto* kShape = []() { + xla::ProgramShapeProto* proto = new xla::ProgramShapeProto; + proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 52); return proto; }(); return kShape; diff --git a/tensorflow/compiler/aot/codegen_test_o.golden b/tensorflow/compiler/aot/codegen_test_o.golden index eb001c5d45b..ce8e5ec8c96 100644 Binary files a/tensorflow/compiler/aot/codegen_test_o.golden and b/tensorflow/compiler/aot/codegen_test_o.golden differ diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index 2b5f97b34cd..9fc223bdc7c 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -56,17 +56,23 @@ Status CompileXla(xla::CompileOnlyClient* client, return errors::Unknown("Couldn't get XLA program shape: ", pshape_or.status().error_message()); } - compile_result->program_shape = *pshape_or.ValueOrDie(); - xla::ProgramShape* pshape = &compile_result->program_shape; - std::vector arg_layouts; - arg_layouts.reserve(pshape->parameters_size()); + compile_result->program_shape = pshape_or.ValueOrDie()->ToProto(); + xla::ProgramShapeProto* pshape = &compile_result->program_shape; + + // AotXlaComputationInstance::argument_layouts is a vector of Shape + // pointers. Accumulate the Shape objects themselves in a separate vector + // while building the vector of pointers. + std::vector arg_layout_ptrs(pshape->parameters_size()); + std::vector arg_layouts(pshape->parameters_size()); for (int i = 0; i < pshape->parameters_size(); ++i) { - arg_layouts.push_back(pshape->mutable_parameters(i)); + arg_layouts[i] = xla::Shape(*pshape->mutable_parameters(i)); + arg_layout_ptrs[i] = &arg_layouts[i]; } xla::CompileOnlyClient::AotXlaComputationInstance instance; instance.computation = &computation; - instance.argument_layouts = std::move(arg_layouts); - instance.result_layout = &pshape->result(); + instance.argument_layouts = std::move(arg_layout_ptrs); + xla::Shape result_shape(pshape->result()); + instance.result_layout = &result_shape; xla::StatusOr>> aot_or = client->CompileAheadOfTime({instance}, aot_opts); if (!aot_or.ok()) { diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h index e03c5b1aa77..ee7bb26fabd 100644 --- a/tensorflow/compiler/aot/compile.h +++ b/tensorflow/compiler/aot/compile.h @@ -33,9 +33,9 @@ namespace tfcompile { struct CompileResult { // Contains object file and meta-info. std::unique_ptr aot; - xla::ProgramShape program_shape; // Static shape of args and results. - string entry_point; // Name of generated function. - int pointer_size = 0; // Size of a pointer in bytes. + xla::ProgramShapeProto program_shape; // Static shape of args and results. + string entry_point; // Name of generated function. + int pointer_size = 0; // Size of a pointer in bytes. 
}; // CompileGraph compiles the graph_def into an object file containing a function diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc index f10852c7850..4dd79e5882d 100644 --- a/tensorflow/compiler/aot/tests/tfcompile_test.cc +++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc @@ -526,13 +526,15 @@ TEST(TFCompileTest, ProgramShape) { // muladd has the program shape defined. MatMulAndAddComp muladd; - const xla::ProgramShape* muladd_shape = muladd.ProgramShape(); + const xla::ProgramShapeProto* muladd_shape = muladd.ProgramShape(); ASSERT_TRUE(muladd_shape != nullptr); ASSERT_EQ(muladd_shape->parameters_size(), 2); - EXPECT_TRUE(ShapeUtil::Compatible(muladd_shape->parameters(0), f32_2x2)); - EXPECT_TRUE(ShapeUtil::Compatible(muladd_shape->parameters(1), f32_2x2)); + EXPECT_TRUE( + ShapeUtil::Compatible(xla::Shape(muladd_shape->parameters(0)), f32_2x2)); + EXPECT_TRUE( + ShapeUtil::Compatible(xla::Shape(muladd_shape->parameters(1)), f32_2x2)); - const xla::Shape& muladd_result = muladd_shape->result(); + const xla::Shape muladd_result(muladd_shape->result()); ASSERT_EQ(muladd_result.element_type(), xla::TUPLE); ASSERT_EQ(ShapeUtil::TupleElementCount(muladd_result), 2); const xla::Shape& muladd_result0 = diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 5f25e4626ad..be91ed4f432 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -23,7 +23,6 @@ package( load("//tensorflow:tensorflow.bzl", "cc_header_only_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") @@ -38,7 +37,7 @@ cc_library( ":xla_cpu_device", ":xla_cpu_jit", "//tensorflow/compiler/plugin", - ] + if_cuda_is_configured([ + ] + if_cuda([ ":xla_gpu_device", ":xla_gpu_jit", ]), @@ -51,6 +50,7 @@ cc_library( deps = [ ":jit_compilation_passes", "//tensorflow/compiler/jit/kernels:xla_ops", + "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops", "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla/service:cpu_plugin", @@ -76,10 +76,10 @@ cc_library( srcs = ["xla_cpu_device.cc"], visibility = [":friends"], deps = [ + ":flags", ":jit_compilation_passes", ":xla_device", "//tensorflow/compiler/jit/kernels:xla_ops", - "//tensorflow/compiler/jit/legacy_flags:xla_device_flags", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla/service:cpu_plugin", # buildcleaner: keep @@ -210,6 +210,18 @@ cc_library( # Internal targets below this point. 
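The `flags` target added just below centralizes what the removed legacy_flags dependencies used to provide. A minimal sketch of how a pass reads one of these flags, mirroring the `GetBuildXlaOpsPassFlags()` call that build_xla_ops_pass.cc switches to later in this diff (the wrapper function here is ours):

```c++
#include "tensorflow/compiler/jit/flags.h"

// Hypothetical helper: reports whether lazy compilation is enabled, using the
// accessor provided by the new //tensorflow/compiler/jit:flags target.
bool LazyCompilationEnabled() {
  return tensorflow::GetBuildXlaOpsPassFlags().tf_xla_enable_lazy_compilation;
}
```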
+cc_library( + name = "flags", + srcs = ["flags.cc"], + hdrs = ["flags.h"], + visibility = [":friends"], + deps = [ + "//tensorflow/compiler/xla:parse_flags_from_env", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + ], +) + cc_library( name = "common", srcs = [ @@ -256,6 +268,7 @@ cc_library( "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:dump_graph", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", @@ -268,6 +281,7 @@ cc_library( "//tensorflow/core/kernels:variable_ops", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], ) @@ -487,6 +501,7 @@ cc_library( deps = [ ":common", ":encapsulate_util", + ":flags", ":shape_inference_helpers", ":union_find", ":xla_cluster_util", @@ -494,8 +509,6 @@ cc_library( "//tensorflow/cc:ops", "//tensorflow/cc:scope_internal", "//tensorflow/compiler/jit/graphcycles", - "//tensorflow/compiler/jit/legacy_flags:build_xla_ops_pass_flags", - "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags", "//tensorflow/compiler/jit/ops:xla_ops", "//tensorflow/compiler/tf2xla:dump_graph", "//tensorflow/compiler/tf2xla:resource_operation_table", @@ -724,7 +737,10 @@ tf_custom_op_py_library( visibility = [ ":friends", ], - deps = ["//tensorflow/compiler/jit/ops:xla_ops_wrapper_py"], + deps = [ + "//tensorflow/compiler/jit/ops:xla_ops_grad", + "//tensorflow/compiler/jit/ops:xla_ops_wrapper_py", + ], ) # This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library. diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc index 93637a69d5d..9f4042630ed 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc @@ -23,7 +23,7 @@ limitations under the License. #include "tensorflow/cc/ops/control_flow_ops.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" -#include "tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" @@ -320,10 +320,10 @@ Status BuildXlaOpsPass::Run(const GraphOptimizationPassOptions& options) { return IsXlaCompiledKernel(*n); }); - bool lazy_compilation_enabled = enable_lazy_compilation_ - ? *enable_lazy_compilation_ - : legacy_flags::GetBuildXlaOpsPassFlags() - .tf_xla_enable_lazy_compilation; + bool lazy_compilation_enabled = + enable_lazy_compilation_ + ? 
*enable_lazy_compilation_ + : GetBuildXlaOpsPassFlags().tf_xla_enable_lazy_compilation; for (Node* n : xla_compiled_kernels) { TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndXlaRun( diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc index 11df946cc18..48a23a4c171 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc @@ -42,14 +42,8 @@ class BuildXlaOpsTest : public ::testing::Test { .ok()); } - void TearDown() override { - for (Device* device : devices_) { - delete device; - } - } - private: - std::vector devices_; + std::vector> devices_; }; using ::tensorflow::testing::FindNodeByName; diff --git a/tensorflow/compiler/jit/create_xla_launch_op_test.cc b/tensorflow/compiler/jit/create_xla_launch_op_test.cc index 73866607621..0f872a480f4 100644 --- a/tensorflow/compiler/jit/create_xla_launch_op_test.cc +++ b/tensorflow/compiler/jit/create_xla_launch_op_test.cc @@ -59,8 +59,9 @@ class CreateXlaLaunchOpTest : public ::testing::Test { SessionOptions options; auto* device_count = options.config.mutable_device_count(); device_count->insert({"CPU", 1}); + std::vector> devices; TF_CHECK_OK(DeviceFactory::AddDevices( - options, "/job:localhost/replica:0/task:0", &devices_)); + options, "/job:localhost/replica:0/task:0", &devices)); FunctionDefLibrary proto; for (const auto& fdef : flib) { @@ -69,7 +70,7 @@ class CreateXlaLaunchOpTest : public ::testing::Test { lib_def_ = absl::make_unique( OpRegistry::Global(), proto); OptimizerOptions opts; - device_mgr_ = absl::make_unique(devices_); + device_mgr_ = absl::make_unique(std::move(devices)); pflr_ = absl::make_unique( device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(), opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr); @@ -77,7 +78,6 @@ class CreateXlaLaunchOpTest : public ::testing::Test { } FunctionLibraryRuntime* flr_; - std::vector devices_; std::unique_ptr device_mgr_; std::unique_ptr lib_def_; std::unique_ptr pflr_; diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc index 28ec37b1b9c..1f4b9c90a4f 100644 --- a/tensorflow/compiler/jit/encapsulate_util.cc +++ b/tensorflow/compiler/jit/encapsulate_util.cc @@ -86,7 +86,7 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name, continue; } else if (src_xla_computation && !dst_xla_computation) { if (src_outside_compilation) { - // Case 1d: outside compilation to host computation control edge. + // Case 1c: outside compilation to host computation control edge. edges_to_remove.push_back(e); TF_RETURN_IF_ERROR(AppendToListAttr( @@ -94,7 +94,7 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name, } } else if (!src_xla_computation && dst_xla_computation) { if (dst_outside_compilation) { - // Case 1d: host computation control to outside compilation edge. + // Case 1c: host computation control to outside compilation edge. edges_to_remove.push_back(e); TF_RETURN_IF_ERROR(AppendToListAttr( @@ -103,40 +103,24 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name, } else { // src_xla_computation && dst_xla_computation if (*src_xla_computation != *dst_xla_computation) { if (src_outside_compilation && dst_outside_compilation) { - // Case 1c: outside compilation to outside compilation control edge. + // Case 1b: outside compilation to outside compilation control edge. 
edges_to_remove.push_back(e); TF_RETURN_IF_ERROR(AppendToListAttr( e->dst(), kXlaControlDependenciesAttrName, e->src()->name())); } else if (src_outside_compilation && !dst_outside_compilation) { - // Case 1b: outside compilation to another XLA computaition control + // Case 1a: outside compilation to another XLA computaition control // edge. TF_RETURN_IF_ERROR(AppendToListAttr( e->src(), kXlaConnectedToOtherXlaComputationAttrName, *dst_xla_computation)); } else if (!src_outside_compilation && dst_outside_compilation) { - // Case 1b: another XLA computaition to outside compilation control + // Case 1a: another XLA computaition to outside compilation control // edge. TF_RETURN_IF_ERROR(AppendToListAttr( e->dst(), kXlaConnectedFromOtherXlaComputationAttrName, *src_xla_computation)); } - } else { // *src_xla_computation == *dst_xla_computation - if (src_outside_compilation && dst_outside_compilation) { - if (*src_outside_compilation != *dst_outside_compilation) { - // Case 1c: outside compilation to outside compilation control edge. - edges_to_remove.push_back(e); - - TF_RETURN_IF_ERROR(AppendToListAttr( - e->dst(), kXlaControlDependenciesAttrName, e->src()->name())); - } - } else if (src_outside_compilation && !dst_outside_compilation) { - // Case 1a: outside compilation to its XLA computation control edge. - ReplaceAttr(e->src(), kXlaConnectedToXlaComputationAttrName, true); - } else if (!src_outside_compilation && dst_outside_compilation) { - // Case 1a: XLA computation to outside compilation in it control edge. - ReplaceAttr(e->dst(), kXlaConnectedFromXlaComputationAttrName, true); - } } } } @@ -181,12 +165,6 @@ Status ProcessXlaToXlaDataEdges(Graph* g, edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()}); VLOG(4) << "XLA -> XLA edge: " << e->DebugString(); } - } else { // *src_xla_computation == *dst_xla_computation - if (src_outside_compilation && dst_outside_compilation && - *src_outside_compilation != *dst_outside_compilation) { - edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()}); - VLOG(4) << "XLA -> XLA edge: " << e->DebugString(); - } } } @@ -263,7 +241,7 @@ Status ProcessDataEdgeBetweenOutsideCompilationAndHostComputation( // Remove the edge from host to outside compilation. Add a placeholder as // outside compilation node input. - std::map placeholders; + std::map, Node*> placeholders; for (int i = 0; i < edges.size(); i++) { Node* dst = g->FindNodeId(edges[i].dst_node_id); const Edge* e; @@ -275,9 +253,10 @@ Status ProcessDataEdgeBetweenOutsideCompilationAndHostComputation( // Find or create placeholder node. string new_name = edges[i].is_host_to_outside_compilation - ? absl::StrCat(src->name(), "_host_to_oc_placeholder") - : absl::StrCat(src->name(), "_oc_to_host_placeholder"); - auto iter = placeholders.find(new_name); + ? 
absl::StrCat(src->name(), "_host_to_oc_placeholder_", src_output) + : absl::StrCat(src->name(), "_oc_to_host_placeholder_", src_output); + auto placeholder_index = std::make_pair(src->name(), src_output); + auto iter = placeholders.find(placeholder_index); Node* placeholder_node; if (iter == placeholders.end()) { NodeDefBuilder placeholder_builder(new_name, "Placeholder"); @@ -310,7 +289,7 @@ Status ProcessDataEdgeBetweenOutsideCompilationAndHostComputation( Status s; placeholder_node = g->AddNode(placeholder_def, &s); TF_RETURN_IF_ERROR(s); - placeholders[new_name] = placeholder_node; + placeholders[placeholder_index] = placeholder_node; } else { placeholder_node = iter->second; } @@ -594,14 +573,244 @@ Status AddControlDependencies( return Status::OK(); } +// Step 1 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of +// `PreprocessEdgesBetweenOutsideCompilations` for details. +Status PreprocessControlEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + // Gather edges to remove. We should not remove the edge while iterating. + std::vector edges_to_remove; + for (const Edge* e : g->edges()) { + if (!e->IsControlEdge()) { + continue; + } + + auto src_outside_compilation = + GetStringAttr(*e->src(), outside_compilation_attr_name); + auto dst_outside_compilation = + GetStringAttr(*e->dst(), outside_compilation_attr_name); + + if (src_outside_compilation && dst_outside_compilation) { + if (*src_outside_compilation != *dst_outside_compilation) { + // Case 1a: outside compilation to outside compilation control edge. + edges_to_remove.push_back(e); + + TF_RETURN_IF_ERROR(AppendToListAttr( + e->dst(), kXlaControlDependenciesWithinXlaClusterAttrName, + e->src()->name())); + } + } else if (src_outside_compilation && !dst_outside_compilation) { + // Case 1b: outside compilation to its XLA computation control edge. + ReplaceAttr(e->src(), kXlaConnectedToXlaComputationAttrName, true); + } else if (!src_outside_compilation && dst_outside_compilation) { + // Case 1b: XLA computation to outside compilation in it control edge. + ReplaceAttr(e->dst(), kXlaConnectedFromXlaComputationAttrName, true); + } + } + + for (auto e : edges_to_remove) { + g->RemoveEdge(e); + } + return Status::OK(); +} + +// Step 2 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of +// `PreprocessEdgesBetweenOutsideCompilations` for details. +Status PreprocessDataEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + // Gather edges between outside compilation and host computation. Notice that + // we do not store `Edge*` directly because we remove some nodes while adding + // Identity nodes, and those Edge pointers might be invalidated. + struct EdgeInfo { + int dst_input, dst_node_id; + }; + std::vector edges; + for (const Edge* e : g->edges()) { + if (e->IsControlEdge()) { + continue; + } + + auto src_outside_compilation = + GetStringAttr(*e->src(), outside_compilation_attr_name); + auto dst_outside_compilation = + GetStringAttr(*e->dst(), outside_compilation_attr_name); + + if (src_outside_compilation && dst_outside_compilation && + *src_outside_compilation != *dst_outside_compilation) { + edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()}); + VLOG(4) << "Oc -> oc edge: " << e->DebugString(); + } + } + + // Remove the edge from host to outside compilation. Add a placeholder as + // outside compilation node input. 
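The comment above describes the placeholder step; the sketch below distills how one such placeholder is materialized with `NodeDefBuilder` (the helper name is ours), while the full implementation that follows additionally dedupes placeholders by (source node, output index) and copies the outside compilation attributes.

```c++
#include "absl/strings/str_cat.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/lib/core/errors.h"

// Sketch: create a Placeholder standing in for one (node, output) pair.
tensorflow::Status AddPlaceholderFor(tensorflow::Graph* g,
                                     const tensorflow::Node* src, int src_output,
                                     tensorflow::Node** placeholder) {
  tensorflow::NodeDefBuilder builder(
      absl::StrCat(src->name(), "_oc_to_oc_placeholder_", src_output),
      "Placeholder");
  builder.Attr("dtype", src->output_type(src_output));
  tensorflow::NodeDef def;
  TF_RETURN_IF_ERROR(builder.Finalize(&def));
  tensorflow::Status s;
  *placeholder = g->AddNode(def, &s);
  return s;
}
```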
+ std::map, Node*> placeholders; + for (int i = 0; i < edges.size(); i++) { + Node* dst = g->FindNodeId(edges[i].dst_node_id); + const Edge* e; + TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e)); + Node* src = e->src(); + int src_output = e->src_output(), dst_input = e->dst_input(); + g->RemoveEdge(e); + + // Find or create placeholder node. + string new_name = + absl::StrCat(src->name(), "_oc_to_oc_placeholder_", src_output); + auto placeholder_index = std::make_pair(src->name(), src_output); + auto iter = placeholders.find(placeholder_index); + Node* placeholder_node; + if (iter == placeholders.end()) { + NodeDefBuilder placeholder_builder(new_name, "Placeholder"); + placeholder_builder.Attr("dtype", src->output_type(src_output)); + string outside_compilation_attr; + TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(), + outside_compilation_attr_name, + &outside_compilation_attr)); + placeholder_builder.Attr(outside_compilation_attr_name, + outside_compilation_attr); + placeholder_builder.Attr(kOutsideCompilationOriginalNodeAttrName, + src->name()); + placeholder_builder.Attr(kOutsideCompilationSrcOutputAttrName, + src_output); + NodeDef placeholder_def; + TF_RETURN_IF_ERROR(placeholder_builder.Finalize(&placeholder_def)); + Status s; + placeholder_node = g->AddNode(placeholder_def, &s); + TF_RETURN_IF_ERROR(s); + placeholders[placeholder_index] = placeholder_node; + } else { + placeholder_node = iter->second; + } + g->AddEdge(placeholder_node, 0, dst, dst_input); + + // Replace `e->dst()` because its input node changed. + NodeDef new_def = dst->def(); + *new_def.mutable_input(dst_input) = placeholder_node->name(); + TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def)); + + // Other edge in `edges` might have `e->dst()` as src or dst + // node. Before removing `e->dst()`, replace those edges with + // corresponding edges for `dst_replace_node`. + for (int j = i + 1; j < edges.size(); j++) { + if (edges[j].dst_node_id == edges[i].dst_node_id) { + edges[j].dst_node_id = dst_replace_node->id(); + } + } + } + return Status::OK(); +} + +// Step 1 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of +// `PostprocessEdgesBetweenOutsideCompilations` for details. +Status PostprocessDataEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + // Gather all outside compilation to outside compilation nodes. + std::vector placeholder_nodes; + for (Node* n : g->nodes()) { + if (n->type_string() == "Placeholder" && + HasNodeAttr(n->def(), kOutsideCompilationOriginalNodeAttrName)) { + placeholder_nodes.push_back(n); + } + } + + // Remove the placeholder nodes, and reconnect original edge. + auto node_name_index = g->BuildNodeNameIndex(); + for (auto n : placeholder_nodes) { + string node_name; + int node_src_output; + TF_RETURN_IF_ERROR(GetNodeAttr( + n->attrs(), kOutsideCompilationOriginalNodeAttrName, &node_name)); + TF_RETURN_IF_ERROR(GetNodeAttr( + n->attrs(), kOutsideCompilationSrcOutputAttrName, &node_src_output)); + auto iter = node_name_index.find(node_name); + if (iter == node_name_index.end()) { + return errors::Internal( + "Cannot find original node for oc -> host placeholder node ", + node_name); + } + + // Change all usage node to use the original node instead. 
+ Node* original_node = iter->second; + std::vector control_edges; + std::vector data_edges; + for (auto e : n->out_edges()) { + if (e->IsControlEdge()) { + control_edges.push_back(e); + } else { + data_edges.push_back({e->dst(), e->src_output(), e->dst_input()}); + } + } + for (const Edge* e : control_edges) { + g->AddControlEdge(original_node, e->dst()); + g->RemoveEdge(e); + } + for (int i = 0; i < data_edges.size(); i++) { + Node* dst = data_edges[i].dst; + NodeDef new_def = dst->def(); + int dst_input = data_edges[i].dst_input; + *new_def.mutable_input(dst_input) = + absl::StrCat(original_node->name(), ":", node_src_output); + TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def)); + + const Edge* edge_to_replace = nullptr; + TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace)); + g->RemoveEdge(edge_to_replace); + g->AddEdge(original_node, node_src_output, replace_node, dst_input); + + // Other edges might have `dst` as dst node. Update those edges with + // `replace_node`. + for (int j = i + 1; j < data_edges.size(); j++) { + if (data_edges[j].dst == dst) { + data_edges[j].dst = replace_node; + } + } + + // Other placeholder node might have `dst` as original node. Update + // `node_name_index` with `replace_node`. + node_name_index[replace_node->name()] = replace_node; + } + + // Remove placeholder node. + g->RemoveNode(n); + } + return Status::OK(); +} + +// Step 2 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of +// `PostprocessEdgesBetweenOutsideCompilations` for details. +Status PostprocessControlEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + auto node_name_index = g->BuildNodeNameIndex(); + + // Reconnect outside compilation to outside compilation control edge. 
+ for (Node* n : g->nodes()) { + std::vector control_deps; + Status s = + GetNodeAttr(n->attrs(), kXlaControlDependenciesWithinXlaClusterAttrName, + &control_deps); + if (!s.ok()) { + if (s.code() != error::NOT_FOUND) { + return s; + } else { + continue; + } + } else { + n->ClearAttr(kXlaControlDependenciesWithinXlaClusterAttrName); + for (const string& control_input : control_deps) { + auto iter = node_name_index.find(control_input); + if (iter == node_name_index.end()) { + return errors::Internal("Cannot find original node for ", + control_input); + } + g->AddControlEdge(iter->second, n); + } + } + } + return Status::OK(); +} } // namespace const char kXlaInferredShapesAttrName[] = "_xla_inferred_shapes"; -const char kXlaConnectedToXlaComputationAttrName[] = - "_xla_connected_to_xla_computation"; -const char kXlaConnectedFromXlaComputationAttrName[] = - "_xla_connected_from_xla_computation"; const char kXlaConnectedToOtherXlaComputationAttrName[] = "_xla_connected_to_other_xla_computation"; const char kXlaConnectedFromOtherXlaComputationAttrName[] = @@ -616,6 +825,15 @@ const char kHostToOutsideCompilationOriginalNodeAttrName[] = "_xla_host_to_oc_node_name"; const char kHostToOutsideCompilationSrcOutputAttrName[] = "_xla_host_to_oc_src_output"; +const char kXlaConnectedToXlaComputationAttrName[] = + "_xla_connected_to_xla_computation"; +const char kXlaConnectedFromXlaComputationAttrName[] = + "_xla_connected_from_xla_computation"; +const char kOutsideCompilationOriginalNodeAttrName[] = + "_xla_oc_to_oc_node_name"; +const char kOutsideCompilationSrcOutputAttrName[] = "_xla_oc_to_oc_src_output"; +const char kXlaControlDependenciesWithinXlaClusterAttrName[] = + "_xla_control_dependencies_within_xla_cluster"; Status PerformStaticShapeInferenceBeforeEncapsulation( Graph* g, const string& xla_computation_attr_name, @@ -699,4 +917,39 @@ Status PostprocessForEncapsulation( return Status::OK(); } +Status PreprocessEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + // Remove edges from source node to outside compilation nodes, and edges + // from outside compilation nodes to sink node. 
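The attribute round trip above is the core of the control-edge handling: node names stashed in a list(string) attr during preprocessing are turned back into real control edges here. A condensed helper showing the same steps (the helper is ours; the attr constant comes from encapsulate_util.h):

```c++
#include <string>
#include <vector>

#include "tensorflow/compiler/jit/encapsulate_util.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/lib/core/errors.h"

tensorflow::Status RestoreControlDeps(tensorflow::Graph* g, tensorflow::Node* n) {
  std::vector<std::string> control_deps;
  TF_RETURN_IF_ERROR(tensorflow::GetNodeAttr(
      n->attrs(), tensorflow::kXlaControlDependenciesWithinXlaClusterAttrName,
      &control_deps));
  n->ClearAttr(tensorflow::kXlaControlDependenciesWithinXlaClusterAttrName);
  auto node_name_index = g->BuildNodeNameIndex();
  for (const std::string& name : control_deps) {
    auto it = node_name_index.find(name);
    if (it == node_name_index.end()) {
      return tensorflow::errors::Internal("Cannot find original node for ", name);
    }
    g->AddControlEdge(it->second, n);
  }
  return tensorflow::Status::OK();
}
```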
+ std::vector edges_to_remove; + for (const Edge* e : g->source_node()->out_edges()) { + if (HasNodeAttr(e->dst()->def(), outside_compilation_attr_name)) { + edges_to_remove.push_back(e); + } + } + for (const Edge* e : g->sink_node()->in_edges()) { + if (HasNodeAttr(e->src()->def(), outside_compilation_attr_name)) { + edges_to_remove.push_back(e); + } + } + for (auto e : edges_to_remove) { + g->RemoveEdge(e); + } + + TF_RETURN_IF_ERROR(PreprocessControlEdgesBetweenOutsideCompilations( + g, outside_compilation_attr_name)); + TF_RETURN_IF_ERROR(PreprocessDataEdgesBetweenOutsideCompilations( + g, outside_compilation_attr_name)); + return Status::OK(); +} + +Status PostprocessEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + TF_RETURN_IF_ERROR(PostprocessDataEdgesBetweenOutsideCompilations( + g, outside_compilation_attr_name)); + TF_RETURN_IF_ERROR(PostprocessControlEdgesBetweenOutsideCompilations( + g, outside_compilation_attr_name)); + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h index 5e0c4bf6a0c..e363bc5754a 100644 --- a/tensorflow/compiler/jit/encapsulate_util.h +++ b/tensorflow/compiler/jit/encapsulate_util.h @@ -44,14 +44,6 @@ Status PerformStaticShapeInferenceBeforeEncapsulation( Graph* g, const string& xla_computation_attr_name, const string& outside_compilation_attr_name); -// Attribute indicating that some ops in this node's XLA computation has control -// dependency on this node. Attribute value will always be "true". -extern const char kXlaConnectedToXlaComputationAttrName[]; - -// Attribute indicating that this node has control dependency on some ops in -// this node's XLA computation. Attribute value will always be "true". -extern const char kXlaConnectedFromXlaComputationAttrName[]; - // Attribute indicating that some ops in other XLA computation has control // dependency on this node. Attribute value will be a list of string (XLA // computation names). @@ -81,6 +73,14 @@ extern const char kOutsideCompilationToHostOriginalNodeAttrName[]; // int (src_output for original edge). extern const char kOutsideCompilationToHostSrcOutputAttrName[]; +// Attribute indicating that some ops in this node's XLA computation has control +// dependency on this node. Attribute value will always be "true". +extern const char kXlaConnectedToXlaComputationAttrName[]; + +// Attribute indicating that this node has control dependency on some ops in +// this node's XLA computation. Attribute value will always be "true". +extern const char kXlaConnectedFromXlaComputationAttrName[]; + // Attribute indicating that this is an Placeholder node added to act as a // temporary input node for an host node. Attribute value will be string // (original input node name). @@ -91,19 +91,31 @@ extern const char kHostToOutsideCompilationOriginalNodeAttrName[]; // for original edge). extern const char kHostToOutsideCompilationSrcOutputAttrName[]; -// Preprocesses the graph for encapsulation. It will perform the following -// operations in order: +// Attribute indicating that this is an Placeholder node added to act as a +// temporary input node for an outside compilation node. Attribute value will be +// string (original input node name). +extern const char kOutsideCompilationOriginalNodeAttrName[]; + +// Attribute indicating that this is an Placeholder node added to act as a +// temporary input node for an outside compilation node. 
Attribute value will be +// int (src_output for original edge). +extern const char kOutsideCompilationSrcOutputAttrName[]; + +// Attribute indicating that this node has control dependencies on some other +// nodes within the same XLA cluster. Attribute value will be a list of string +// (node names). +extern const char kXlaControlDependenciesWithinXlaClusterAttrName[]; + +// Preprocesses edges between different XLA clusters for encapsulation. It will +// perform the following operations in order: // -// 1a. For control edges between outside compilation and its XLA computation, -// add attr "kXlaConnected{From, To}XlaComputationAttrName = true" to the -// outside compilation node. -// 1b. For control edges between outside compilation and another XLA +// 1a. For control edges between outside compilation and another XLA // computation, add attr "kXlaConnected{From, To}OtherXlaComputationAttrName // = XLA computation node name" to the outside compilation node. -// 1c. For control edges between different outside compilations, remove the edge -// and add attr "kXlaControlDependenciesAttrName = src node name" to dst -// node. -// 1d. For control edges between outside compilation and host computation, +// 1b. For control edges between different outside compilations (in different +// XLA computations), remove the edge and add attr +// "kXlaControlDependenciesAttrName = src node name" to dst node. +// 1c. For control edges between outside compilation and host computation, // remove the edge and add attr "kXlaControlDependenciesAttrName = src node // name" to dst node. // 2. For data edges between different XLA computations, if either src or dst @@ -146,26 +158,53 @@ struct XlaClusterInfo { const std::map host_compute_core; }; -// Postprocesses the graph for encapsulation. This function reverts what -// `PreprocessForEncapsulation` did. It will perform the following operations in -// order: +// Postprocesses edges between different XLA clusters for encapsulation. This +// function reverts what `PreprocessForEncapsulation` did. It will perform the +// following operations in order: // // 1. Remove Placeholder nodes between outside compilation and host computation // (created in `PreprocessForEncapsulation` step 3). // 2. Remove Identity nodes created in `PreprocessForEncapsulation` step 2. -// 3a. Reconnect control edges between different outside compilations (marked by -// `PreprocessForEncapsulation` step 1c) and control edges between outside -// compilation and host computation (marked by `PreprocessForEncapsulation` -// step 1d). -// 3b. Reconnect control edges between outside compilation and another XLA -// computation (marked by `PreprocessForEncapsulation` step 1b). -// Notice that control edges marked by `PreprocessForEncapsulation` step 1a are -// not handled here. They are handled in `RewriteOutsideCompilationSubgraphFn`. +// 3a. Reconnect control edges between outside compilation and another XLA +// computation (marked by `PreprocessForEncapsulation` step 1a). +// 3b. Reconnect control edges between different outside compilations (marked by +// `PreprocessForEncapsulation` step 1b). +// 3c. Reconnect control edges between outside compilation and host computation +// (marked by `PreprocessForEncapsulation` step 1c). Status PostprocessForEncapsulation( Graph* g, const string& xla_computation_attr_name, const string& outside_compilation_attr_name, const std::unordered_map& clusters); +// Preprocesses edges within the same XLA cluster. 
It will perform the following +// operations in order: +// +// 0. Remove edges from source node to outside compilation nodes, and edges +// from outside compilation nodes to sink node. +// 1a. For edges between different outside compilation clusters, remove the edge +// and add attr "kXlaControlDependenciesWithinXlaClusterAttrName = src node +// name" to dst node. +// 1b. For control edges between outside compilation and its XLA computation, +// add attr "kXlaConnected{From, To}XlaComputationAttrName = true" to the +// outside compilation node. +// 2. For data edges between different outside compilations, remove the edge +// and create a Placeholder node as dst node's input. +Status PreprocessEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name); + +// Postprocesses edges within the same XLA cluster. This function reverts what +// `PreprocessEdgesBetweenOutsideCompilations` did. It will perform the +// following operations in order: +// +// 1. Remove Placeholder nodes between different outside compilations (created +// in `PreprocessEdgesBetweenOutsideCompilations` step 2). +// 2a. Reconnect control edges between different outside compilations (marked by +// `PreprocessEdgesBetweenOutsideCompilations` step 1a). +// Notice that control edges marked by +// `PreprocessEdgesBetweenOutsideCompilations` step 1b are not handled here. +// They are handled in `RewriteOutsideCompilationSubgraphFn`. +Status PostprocessEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_ diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc index 7255df31129..3b8b49cb92f 100644 --- a/tensorflow/compiler/jit/encapsulate_util_test.cc +++ b/tensorflow/compiler/jit/encapsulate_util_test.cc @@ -107,28 +107,19 @@ TEST(PreprocessForEncapsulationTest, ControlEdges) { identity4_node->AddAttr("_xla", "1"); identity4_node->AddAttr("_oc", "0"); identity5_node->AddAttr("_xla", "1"); - // Case 1a: control edges between outside compilation and its XLA computation. - g.AddControlEdge(add_node, identity0_node); - g.AddControlEdge(identity0_node, identity1_node); - // Case 1b: control edges between outside compilation and another XLA + // Case 1a: control edges between outside compilation and another XLA // computation. g.AddControlEdge(identity0_node, identity3_node); g.AddControlEdge(identity1_node, identity4_node); - // Case 1c: control edges between different outside compilations. + // Case 1b: control edges between different outside compilations. g.AddControlEdge(identity0_node, identity4_node); - // Case 1d: control edges between outside compilation and host computation. + // Case 1c: control edges between outside compilation and host computation. g.AddControlEdge(const0_node, identity0_node); g.AddControlEdge(identity0_node, identity2_node); TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc")); - // Case 1a: add attr "_xla_connected_{from/to}_xla_computation = true" to the - // outside compilation node. - EXPECT_TRUE(HasNodeAttr(identity0_node->def(), - kXlaConnectedFromXlaComputationAttrName)); - EXPECT_TRUE(HasNodeAttr(identity0_node->def(), - kXlaConnectedToXlaComputationAttrName)); - // Case 1b: add attr "_xla_control_deps_{from/to} = XLA computation node name" + // Case 1a: add attr "_xla_control_deps_{from/to} = XLA computation node name" // to the outside compilation node. 
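// Editor's sketch (not part of this change; the helper name below is
// hypothetical): the attribute round trip behind
// "_xla_control_dependencies_within_xla_cluster". Preprocessing folds a
// control edge into a name-list attribute on the destination node; the
// postprocessing loop earlier in this patch reads that list, clears the
// attribute, and re-adds the edges by looking the source nodes up by name.
//
//   Status RecordControlDepAsAttr(Graph* g, const Edge* e) {
//     std::vector<string> deps;
//     Status s = GetNodeAttr(e->dst()->attrs(),
//                            kXlaControlDependenciesWithinXlaClusterAttrName,
//                            &deps);
//     if (!s.ok() && s.code() != error::NOT_FOUND) return s;
//     deps.push_back(e->src()->name());
//     e->dst()->ClearAttr(kXlaControlDependenciesWithinXlaClusterAttrName);
//     e->dst()->AddAttr(kXlaControlDependenciesWithinXlaClusterAttrName, deps);
//     g->RemoveEdge(e);
//     return Status::OK();
//   }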
std::vector attr; TF_CHECK_OK(GetNodeAttr(identity0_node->def(), @@ -140,13 +131,13 @@ TEST(PreprocessForEncapsulationTest, ControlEdges) { kXlaConnectedFromOtherXlaComputationAttrName, &attr)); EXPECT_EQ(attr.size(), 1); EXPECT_EQ(attr[0], "0"); - // Case 1c: add attr "_xla_control_deps = src node name" to dst node. + // Case 1b: add attr "_xla_control_deps = src node name" to dst node. attr.clear(); TF_CHECK_OK(GetNodeAttr(identity4_node->def(), kXlaControlDependenciesAttrName, &attr)); EXPECT_EQ(attr.size(), 1); EXPECT_EQ(attr[0], "identity0"); - // Case 1d: add attr "_xla_control_deps = src node name" to dst node. + // Case 1c: add attr "_xla_control_deps = src node name" to dst node. attr.clear(); TF_CHECK_OK(GetNodeAttr(identity0_node->def(), kXlaControlDependenciesAttrName, &attr)); @@ -162,23 +153,33 @@ TEST(PreprocessForEncapsulationTest, ControlEdges) { TEST(PreprocessForEncapsulationTest, DataEdges) { // Build the graph: // "const_0" and "const_1" in host computation + // "identityn0" = ("const_0", "const_1") in host computation 0 // "add0" = "const_0" + "const_1" in XLA computation 0 // "add1" = "add0" + "const_0" in XLA computation 0 & outside compilation 0 // "identity0" = "add1" in XLA computation 0 // "add2" = "add1" + "identity0" in host computation // "add3" = "add1" + "add2" in XLA computation 1 - // "add4" = "identity0" + "add2" in XLA computation 1 & outside compilation 1 + // "add4" = "identity0" + "add2" in XLA computation 1 & outside compilation 0 + // "add5" = "identityn0"[0] + "identityn0"[1] in XLA computation 1 & + // outside compilation 0 + // "identityn1" = ("identityn0"[0], "identityn0"[1]) in XLA computation 1 & + // outside compilation 0 // "identity1" = "add4" in XLA computation 1 // "identity2" = "identity1" in host computation tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output const_0 = ops::Const(s.WithOpName("const_0"), 1, {}); Output const_1 = ops::Const(s.WithOpName("const_1"), 2, {}); + auto identityn0 = + ops::IdentityN(s.WithOpName("identityn_0"), {const_0, const_1}); Output add0 = ops::Add(s.WithOpName("add0"), const_0, const_1); Output add1 = ops::Add(s.WithOpName("add1"), add0, const_0); Output identity0 = ops::Identity(s.WithOpName("identity0"), add1); Output add2 = ops::Add(s.WithOpName("add2"), add1, identity0); Output add3 = ops::Add(s.WithOpName("add3"), add1, add2); Output add4 = ops::Add(s.WithOpName("add4"), identity0, add2); + Output add5 = ops::Add(s.WithOpName("add5"), identityn0[0], identityn0[1]); + auto identityn1 = ops::IdentityN(s.WithOpName("identityn_1"), + {identityn0[0], identityn0[1]}); Output identity1 = ops::Identity(s.WithOpName("identity1"), add4); Output identity2 = ops::Identity(s.WithOpName("identity2"), add4); Graph g(OpRegistry::Global()); @@ -189,6 +190,8 @@ TEST(PreprocessForEncapsulationTest, DataEdges) { Node *add0_node = node_index["add0"], *add1_node = node_index["add1"], *identity0_node = node_index["identity0"], *add3_node = node_index["add3"], *add4_node = node_index["add4"], + *add5_node = node_index["add5"], + *identityn1_node = node_index["identityn_1"], *identity1_node = node_index["identity1"]; add0_node->AddAttr("_xla", "0"); add1_node->AddAttr("_xla", "0"); @@ -197,6 +200,10 @@ TEST(PreprocessForEncapsulationTest, DataEdges) { add3_node->AddAttr("_xla", "1"); add4_node->AddAttr("_xla", "1"); add4_node->AddAttr("_oc", "0"); + add5_node->AddAttr("_xla", "1"); + add5_node->AddAttr("_oc", "0"); + identityn1_node->AddAttr("_xla", "1"); + identityn1_node->AddAttr("_oc", "0"); 
identity1_node->AddAttr("_xla", "1"); TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc")); @@ -214,8 +221,9 @@ TEST(PreprocessForEncapsulationTest, DataEdges) { EXPECT_NE(bridge_identity0_add4, nullptr); // Step 3: add placeholder for edges between host computation and outside // compilation. - EXPECT_EQ(bridge_add1_add3->def().input(0), "add1_oc_to_host_placeholder"); - Node *add1_oc_to_host_placeholder = node_index["add1_oc_to_host_placeholder"]; + EXPECT_EQ(bridge_add1_add3->def().input(0), "add1_oc_to_host_placeholder_0"); + Node *add1_oc_to_host_placeholder = + node_index["add1_oc_to_host_placeholder_0"]; TF_CHECK_OK(GetNodeAttr(add1_oc_to_host_placeholder->attrs(), kOutsideCompilationToHostOriginalNodeAttrName, &str)); EXPECT_EQ(str, "add1"); @@ -226,15 +234,34 @@ TEST(PreprocessForEncapsulationTest, DataEdges) { add4_node = node_index["add4"]; ASSERT_NE(add4_node, nullptr); EXPECT_EQ(add4_node->def().input(0), - "bridge_identity0_add4_host_to_oc_placeholder"); + "bridge_identity0_add4_host_to_oc_placeholder_0"); Node *identity0_host_to_oc_placeholder = - node_index["bridge_identity0_add4_host_to_oc_placeholder"]; + node_index["bridge_identity0_add4_host_to_oc_placeholder_0"]; TF_CHECK_OK(GetNodeAttr(identity0_host_to_oc_placeholder->attrs(), kHostToOutsideCompilationOriginalNodeAttrName, &str)); EXPECT_EQ(str, "bridge_identity0_add4"); TF_CHECK_OK(GetNodeAttr(identity0_host_to_oc_placeholder->attrs(), kHostToOutsideCompilationSrcOutputAttrName, &i)); EXPECT_EQ(i, 0); + + // Check different placeholder nodes are created for different src_output. + Node *placeholder0 = node_index["identityn_0_host_to_oc_placeholder_0"], + *placeholder1 = node_index["identityn_0_host_to_oc_placeholder_1"]; + EXPECT_NE(placeholder0, nullptr); + EXPECT_NE(placeholder1, nullptr); + // Check we only have 2 placeholder nodes created for "identityn_0". + int placeholder_count = 0; + for (Node *n : g.nodes()) { + if (HasNodeAttr(n->def(), kHostToOutsideCompilationOriginalNodeAttrName)) { + string attr; + TF_CHECK_OK(GetNodeAttr( + n->attrs(), kHostToOutsideCompilationOriginalNodeAttrName, &attr)); + if (attr == "identityn_0") { + ++placeholder_count; + } + } + } + EXPECT_EQ(placeholder_count, 2); } TEST(PostprocessForEncapsulationTest, ControlEdges) { diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc index 2ce6fa73fc4..d334100aa4a 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc @@ -195,8 +195,11 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, e->dst()->attrs().Find(kXlaClusterAttr) == nullptr && e->dst()->type_string() != kXlaClusterOutput) { return errors::InvalidArgument( - "Undeclared output of XLA computation. A common cause of this error " - "is variable initializers that depend on the XLA computation. Edge: ", + "Undeclared output of XLA computation. Some common causes of this " + "error are: 1) variable initializers that depend on the XLA " + "computation; 2) gradient computations that depend on the XLA " + "computation, which can be mitigated by moving gradient computations " + "inside XLA computation. 
Offending edge: ", e->src()->name(), ":", e->src_output(), " -> ", e->dst()->name(), ":", e->dst_input()); } diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index 8b3587c5087..e3c7e2f89be 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -366,7 +366,7 @@ Status ReplaceOrRemoveOutsideCompilationCallNode( // replace this node with compilation result node. // 3) all outside compilation graphs. Status ConstructHostGraph( - const string& xla_cluster_name, + const string& xla_cluster_name, const string& outside_compilation_attr_name, const std::vector& outside_compilation_host_graphs, FunctionLibraryDefinition* fld, std::unique_ptr* host_graph) { host_graph->reset(new Graph(fld)); @@ -476,6 +476,10 @@ Status ConstructHostGraph( host_graph->get(), std::unordered_set{(*host_graph)->sink_node()}); + // Postprocess edges between different outside compilations. + TF_RETURN_IF_ERROR(PostprocessEdgesBetweenOutsideCompilations( + host_graph->get(), outside_compilation_attr_name)); + if (VLOG_IS_ON(4)) { dump_graph::DumpGraphToFile( absl::StrCat("extract_outside_compilation_host_graph_for_", @@ -801,6 +805,11 @@ Status ExtractOutsideCompilationForFunction( }, &fbody)); std::unique_ptr fbody_deleter(fbody); + + // Preprocess edges between different outside compilations. They will be + // restored in `ConstructHostGraph()`. + TF_RETURN_IF_ERROR(PreprocessEdgesBetweenOutsideCompilations( + fbody->graph, outside_compilation_attr_name)); if (VLOG_IS_ON(4)) { dump_graph::DumpGraphToFile( absl::StrCat("extract_outside_compilation_for_func_before_", func_name), @@ -860,8 +869,9 @@ Status ExtractOutsideCompilationForFunction( // Construct host graph. if (!outside_compilation_host_graphs.empty()) { - TF_RETURN_IF_ERROR(ConstructHostGraph( - xla_cluster_name, outside_compilation_host_graphs, fld, host_graph)); + TF_RETURN_IF_ERROR( + ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name, + outside_compilation_host_graphs, fld, host_graph)); } // Remove the outside compilation graphs from function library. diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc index c5bd64f004e..bff956100da 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc @@ -290,21 +290,18 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) { TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shapes", &shapes)); EXPECT_EQ(shapes.size(), 1); EXPECT_EQ(shapes[0].dim_size(), 1); - // Check XlaHostCompute nodes' "shape_inference_graph" attr. "0" should have a - // non-empty value, and "1" should have an empty value. + // Check XlaHostCompute nodes' "shape_inference_graph" attr. Both should have + // empty values. string shape_inference_graph; TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph", &shape_inference_graph)); - EXPECT_EQ(shape_inference_graph, - "_outside_compilation_shape_inference_cluster_0"); + EXPECT_EQ(shape_inference_graph, ""); TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph", &shape_inference_graph)); EXPECT_EQ(shape_inference_graph, ""); // Check `shape_inference_graphs`. 
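// Editor's sketch (illustrative only; variable names are placeholders): the
// call order this change introduces around extraction. Edges between outside
// compilation clusters are folded into attributes on the function body graph
// before rewriting, and restored on the host graph built by
// ConstructHostGraph().
//
//   TF_RETURN_IF_ERROR(PreprocessEdgesBetweenOutsideCompilations(
//       fbody_graph, outside_compilation_attr_name));
//   // ... extract outside compilation clusters, build `host_graph` ...
//   TF_RETURN_IF_ERROR(PostprocessEdgesBetweenOutsideCompilations(
//       host_graph, outside_compilation_attr_name));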
- EXPECT_EQ(shape_inference_graphs.size(), 1); - EXPECT_EQ(shape_inference_graphs[0], - "_outside_compilation_shape_inference_cluster_0"); + EXPECT_EQ(shape_inference_graphs.size(), 0); // Check `host_graph`: verify we have key placeholder and sequencer. Node *key_placeholder = nullptr, *sequencer = nullptr; @@ -333,8 +330,8 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) { send_recv_nodes.push_back(n); } } - EXPECT_EQ(num_send_from_host, 2); - EXPECT_EQ(num_recv_at_host, 2); + EXPECT_EQ(num_send_from_host, 1); + EXPECT_EQ(num_recv_at_host, 1); for (Node *n : send_recv_nodes) { Node *input_node; TF_CHECK_OK(n->input_node(n->num_inputs() - 1, &input_node)); diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc new file mode 100644 index 00000000000..98e344b3a08 --- /dev/null +++ b/tensorflow/compiler/jit/flags.cc @@ -0,0 +1,152 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include // NOLINT + +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/xla/parse_flags_from_env.h" +#include "tensorflow/core/util/command_line_flags.h" + +namespace tensorflow { +namespace { + +BuildXlaOpsPassFlags* build_ops_flags; +DumpGraphFlags* dump_graph_flags; +MarkForCompilationPassFlags* mark_for_compilation_flags; +XlaDeviceFlags* device_flags; +XlaOpsCommonFlags* ops_flags; + +std::vector* flag_list; +std::once_flag flags_init; + +void AppendDumpGraphFlagsInternal(std::vector* flag_list) { + std::vector new_flags = { + Flag("tf_dump_graph_prefix", &dump_graph_flags->tf_dump_graph_prefix, + "Path prefix to which graphs dumped during debugging should be " + "written."), + }; + flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end()); +} + +void AppendMarkForCompilationPassFlagsInternal(std::vector* flag_list) { + std::vector new_flags = { + Flag("tf_xla_auto_jit", &mark_for_compilation_flags->tf_xla_auto_jit, + "Control compilation of operators into XLA computations on CPU and " + "GPU devices. 0 = use ConfigProto setting; -1 = off; 1 = on for " + "things very likely to be improved; 2 = on for everything. " + "Experimental."), + Flag("tf_xla_min_cluster_size", + &mark_for_compilation_flags->tf_xla_min_cluster_size, + "Minimum number of operators in an XLA compilation. 
Ignored for " + "operators placed on an XLA device or operators explicitly marked " + "for compilation."), + Flag("tf_xla_max_cluster_size", + &mark_for_compilation_flags->tf_xla_max_cluster_size, + "Maximum number of operators in an XLA compilation."), + Flag("tf_xla_clustering_debug", + &mark_for_compilation_flags->tf_xla_clustering_debug, + "Dump graphs during XLA compilation."), + Flag("tf_xla_cpu_global_jit", + &mark_for_compilation_flags->tf_xla_cpu_global_jit, + "Enables global JIT compilation for CPU via SessionOptions."), + Flag("tf_xla_clustering_fuel", + &mark_for_compilation_flags->tf_xla_clustering_fuel, + "Places an artificial limit on the number of ops marked as " + "eligible for clustering."), + Flag("tf_xla_fusion_only", + &mark_for_compilation_flags->tf_xla_fusion_only, + "enable fusion of element-wise operations only using XLA when " + "global_jit_level is ON*.")}; + flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end()); +} + +void AllocateAndParseFlags() { + build_ops_flags = new BuildXlaOpsPassFlags; + build_ops_flags->tf_xla_enable_lazy_compilation = true; + + dump_graph_flags = new DumpGraphFlags; + dump_graph_flags->tf_dump_graph_prefix = "/tmp/"; + + mark_for_compilation_flags = new MarkForCompilationPassFlags; + mark_for_compilation_flags->tf_xla_auto_jit = 0; + mark_for_compilation_flags->tf_xla_min_cluster_size = 2; + mark_for_compilation_flags->tf_xla_max_cluster_size = + std::numeric_limits::max(); + mark_for_compilation_flags->tf_xla_clustering_debug = false; + mark_for_compilation_flags->tf_xla_cpu_global_jit = false; + mark_for_compilation_flags->tf_xla_clustering_fuel = + std::numeric_limits::max(); + mark_for_compilation_flags->tf_xla_fusion_only = false; + + device_flags = new XlaDeviceFlags; + device_flags->tf_xla_compile_on_demand = false; + + ops_flags = new XlaOpsCommonFlags; + ops_flags->tf_xla_always_defer_compilation = false; + + flag_list = new std::vector({ + Flag("tf_xla_enable_lazy_compilation", + &build_ops_flags->tf_xla_enable_lazy_compilation, ""), + + Flag("tf_xla_compile_on_demand", &device_flags->tf_xla_compile_on_demand, + "Switch a device into 'on-demand' mode, where instead of " + "autoclustering ops are compiled one by one just-in-time."), + + Flag("tf_xla_always_defer_compilation", + &ops_flags->tf_xla_always_defer_compilation, ""), + }); + AppendDumpGraphFlagsInternal(flag_list); + AppendMarkForCompilationPassFlagsInternal(flag_list); + xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", *flag_list); +} + +} // namespace + +const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags() { + std::call_once(flags_init, &AllocateAndParseFlags); + return *build_ops_flags; +} + +DumpGraphFlags* GetDumpGraphFlags() { + std::call_once(flags_init, &AllocateAndParseFlags); + return dump_graph_flags; +} + +MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() { + std::call_once(flags_init, &AllocateAndParseFlags); + return mark_for_compilation_flags; +} + +XlaDeviceFlags* GetXlaDeviceFlags() { + std::call_once(flags_init, &AllocateAndParseFlags); + return device_flags; +} + +const XlaOpsCommonFlags& GetXlaOpsCommonFlags() { + std::call_once(flags_init, &AllocateAndParseFlags); + return *ops_flags; +} + +void AppendMarkForCompilationPassFlags(std::vector* flag_list) { + std::call_once(flags_init, &AllocateAndParseFlags); + AppendMarkForCompilationPassFlagsInternal(flag_list); +} + +void AppendDumpGraphFlags(std::vector* flag_list) { + std::call_once(flags_init, &AllocateAndParseFlags); + 
AppendDumpGraphFlagsInternal(flag_list); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h b/tensorflow/compiler/jit/flags.h similarity index 57% rename from tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h rename to tensorflow/compiler/jit/flags.h index 79b47357a17..5ddea588eef 100644 --- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -13,10 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_ -#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_ - -// Legacy flags for the XLA bridge's mark_for_compilation_pass module. +#ifndef TENSORFLOW_COMPILER_JIT_FLAGS_H_ +#define TENSORFLOW_COMPILER_JIT_FLAGS_H_ #include @@ -24,15 +22,8 @@ limitations under the License. #include "tensorflow/core/util/command_line_flags.h" namespace tensorflow { -namespace legacy_flags { -// Append to *flag_list flag definitions associated with the XLA bridge's -// mark_for_compilation_pass module. -void AppendMarkForCompilationPassFlags( - std::vector* flag_list); - -// The values of flags associated with the XLA bridge's -// mark_for_compilation_pass module. +// Flags associated with the XLA bridge's mark_for_compilation_pass module. struct MarkForCompilationPassFlags { int32 tf_xla_auto_jit; // Control compilation of operators into XLA // computations on CPU and GPU devices. 0 = use @@ -57,12 +48,56 @@ struct MarkForCompilationPassFlags { // only using XLA. }; -// Return a pointer to the MarkForCompilationPassFlags struct; +// Flags associated with the XLA bridge's xla_device module. +struct XlaDeviceFlags { + // Switch the CPU device into "on-demand" mode, where instead of + // autoclustering ops are compiled one by one just-in-time. + // Enabling this mode by a legacy flag is a temporary mechanism. When this + // feature is battle-tested, we will switch this to be a session option. + bool tf_xla_compile_on_demand; +}; + +// Flags common to the _Xla* ops and their kernels. +struct XlaOpsCommonFlags { + // If true, _XlaCompile always refuses to compile the cluster, which means the + // XLA clusters always run in the TF executor. Defaults to false. + bool tf_xla_always_defer_compilation; +}; + +// Flags for the build_xla_ops pass. +struct BuildXlaOpsPassFlags { + // Enables lazy compilation for TF/XLA (only when auto-clustering) if true. + // Defaults to true. + bool tf_xla_enable_lazy_compilation; +}; + +// Flags for the XLA bridge's dump_graph module. +struct DumpGraphFlags { + // Path prefix to which graphs dumped during debugging should be written. + string tf_dump_graph_prefix; +}; + +// Return a pointer to the DumpGraphFlags struct; // repeated calls return the same pointer. // This should be called only after Flags::Parse() has returned. -MarkForCompilationPassFlags* GetMarkForCompilationPassFlags(); -} // namespace legacy_flags +// Getters for flags structs defined above. The first call to any of these +// parses TF_XLA_FLAGS for all of them. Those functions which return a pointer +// always return the same pointer. 
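// Example usage (editor's sketch, mirroring the call sites updated elsewhere
// in this change): the first getter call parses TF_XLA_FLAGS exactly once via
// std::call_once, so setting e.g.
//   TF_XLA_FLAGS="--tf_xla_auto_jit=2 --tf_xla_clustering_debug=true"
// in the environment is visible to every subsequent caller:
//
//   #include "tensorflow/compiler/jit/flags.h"
//   ...
//   MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
//   if (flags->tf_xla_clustering_debug) {
//     // dump graphs for debugging
//   }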
+MarkForCompilationPassFlags* GetMarkForCompilationPassFlags(); +const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags(); +XlaDeviceFlags* GetXlaDeviceFlags(); +const XlaOpsCommonFlags& GetXlaOpsCommonFlags(); +DumpGraphFlags* GetDumpGraphFlags(); + +// Appends the flag definitions associated with +// MarkForCompilationPassFlags/DumpGraphFlags to `flag_list`. +// +// Has the side-effect of parsing TF_XLA_FLAGS if that hasn't happened yet. +void AppendMarkForCompilationPassFlags( + std::vector* flag_list); +void AppendDumpGraphFlags(std::vector* flag_list); + } // namespace tensorflow -#endif // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_ +#endif // TENSORFLOW_COMPILER_JIT_FLAGS_H_ diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc index d984ca15cb7..ce53f70b79d 100644 --- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc +++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc @@ -23,7 +23,7 @@ limitations under the License. #include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/math_ops.h" -#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" @@ -208,8 +208,12 @@ Status ComputeSliceSize(const Scope& host_scope, DCHECK_EQ(slice_size.back().type(), DT_INT64); } - *size = ops::Concat(host_scope.WithOpName("slice_size"), slice_size, - ops::Const(host_scope.WithOpName("concat_axis"), 0)); + // Trivial ConcatV2 nodes (with exactly one input) are disallowed. + *size = + slice_size.size() == 1 + ? slice_size[0] + : ops::Concat(host_scope.WithOpName("slice_size"), slice_size, + ops::Const(host_scope.WithOpName("concat_axis"), 0)); return Status::OK(); } @@ -242,6 +246,9 @@ Status ConvertTensorFlowSliceToStaticShapedSlice( .WithOpName("static_shaped_slice"), slice_inputs_int64.input, slice_inputs_int64.begin, slice_size) .node(); + + TF_RETURN_IF_ERROR(main_scope.status()); + std::vector compile_time_const_inputs; compile_time_const_inputs.push_back("size"); (*result)->AddAttr(kXlaCompileTimeConstantInputsAttr, @@ -284,49 +291,45 @@ Status RewriteSlice(Graph* g, Node* slice, const SliceInputs& slice_inputs, return Status::OK(); } -// If `n` is a slice we can rewrite to have a static shape (i.e. have the output -// shape only depend on the "size" input) then returns the a SliceInputs -// representing the inputs to `n`. Otherwise returns nullopt. -StatusOrOptional IsRewritableSlice(Node* n) { +// Return true if `n` is a slice we can rewrite to have a static shape +// (i.e. have the output shape only depend on the "size" input). +xla::StatusOr IsRewritableSlice(Node* n) { if (n->type_string() != "Slice") { - return {absl::nullopt}; + return false; } if (!GetXlaClusterForNode(*n).has_value()) { // There is no need to change slice ops outside XLA clusters. - return {absl::nullopt}; + return false; } TF_ASSIGN_OR_RETURN(absl::optional slice_inputs, GetSliceInputs(n)); if (!slice_inputs.has_value()) { - return {absl::nullopt}; + return false; } // If slice_size[i] < -1 for any i then executing the slice will throw an // error, and we don't do anything here. 
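// Editor's note (sketch, not part of this change): a Slice size of -1 is the
// TensorFlow convention for "take everything from `begin` to the end of that
// dimension", so only sizes strictly below -1 can never execute successfully.
// The check therefore reduces to the predicate the rewritten function returns:
//
//   absl::c_all_of(slice_inputs->size_as_vector,
//                  [](int64 size_i) { return size_i >= -1; });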
- bool slice_is_ok = absl::c_all_of(slice_inputs->size_as_vector, - [](int64 size_i) { return size_i >= -1; }); - if (!slice_is_ok) { - return {absl::nullopt}; - } - - return slice_inputs; + return absl::c_all_of(slice_inputs->size_as_vector, + [](int64 size_i) { return size_i >= -1; }); } Status FindAndRewriteSlices(Graph* g, bool* changed) { - std::vector> slices_to_rewrite; + std::vector slices_to_rewrite; for (Node* n : g->nodes()) { - TF_ASSIGN_OR_RETURN(absl::optional slice_inputs, - IsRewritableSlice(n)); - if (slice_inputs.has_value()) { - slices_to_rewrite.push_back({n, std::move(*slice_inputs)}); + TF_ASSIGN_OR_RETURN(bool is_rewritable, IsRewritableSlice(n)); + if (is_rewritable) { + slices_to_rewrite.push_back(n); } } - for (const auto& pair : slices_to_rewrite) { - TF_RETURN_IF_ERROR(RewriteSlice(g, pair.first, pair.second, - *GetXlaClusterForNode(*pair.first))); + for (Node* n : slices_to_rewrite) { + TF_ASSIGN_OR_RETURN(absl::optional slice_inputs, + GetSliceInputs(n)); + TF_RET_CHECK(slice_inputs.has_value()); + TF_RETURN_IF_ERROR( + RewriteSlice(g, n, *slice_inputs, *GetXlaClusterForNode(*n))); } if (!slices_to_rewrite.empty()) { @@ -342,8 +345,7 @@ Status FindAndRewriteSlices(Graph* g, bool* changed) { Status IncreaseDynamismForAutoJitPass::Run( const GraphOptimizationPassOptions& options) { - legacy_flags::MarkForCompilationPassFlags* flags = - legacy_flags::GetMarkForCompilationPassFlags(); + MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); if (flags->tf_xla_clustering_debug) { dump_graph::DumpGraphToFile("before_increase_dynamism_for_auto_jit_pass", **options.graph, options.flib_def); diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc index 0f6f612e967..a2f1b831ad7 100644 --- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc +++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc @@ -27,6 +27,7 @@ limitations under the License. 
namespace tensorflow { namespace { +using ::testing::_; using testing::matchers::AssignedDevice; using testing::matchers::Attr; using testing::matchers::Const; @@ -142,6 +143,26 @@ TEST(SliceToDynamicSliceRewriteTest, Basic) { EXPECT_THAT(static_shaped_slice, m_dynamic_slice); } +TEST(SliceToDynamicSliceRewriteTest, SliceFromVector) { + Scope root = Scope::NewRootScope() + .ExitOnError() + .WithAssignedDevice(kDeviceName) + .WithXlaCluster("cluster_0"); + + Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT); + Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32); + Output size = ops::Const(root.WithOpName("size"), {-1}); + Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size); + + std::unique_ptr result; + TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result)); + + Node* static_shaped_slice = testing::FindNodeByName( + result.get(), "slice/static_shaped_slice/static_shaped_slice"); + EXPECT_NE(static_shaped_slice, nullptr); + EXPECT_THAT(result->nodes(), Not(Contains(NodeWith(Op("ConcatV2"))))); +} + TEST(SliceToDynamicSliceRewriteTest, ControlDependencePreserved) { Scope root = Scope::NewRootScope() .ExitOnError() @@ -166,18 +187,18 @@ TEST(SliceToDynamicSliceRewriteTest, ControlDependencePreserved) { CtrlDeps(NodeWith(Op("Placeholder"), Name("control"))))); } +int64 ToInt64(int v) { return static_cast(v); } + TEST(SliceToDynamicSliceRewriteTest, Int64Indices) { Scope root = Scope::NewRootScope() .ExitOnError() .WithAssignedDevice(kDeviceName) .WithXlaCluster("cluster_0"); - auto to_int64 = [](int v) { return static_cast(v); }; - Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT); Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64); Output size = - ops::Const(root.WithOpName("size"), {to_int64(-1), to_int64(500)}); + ops::Const(root.WithOpName("size"), {ToInt64(-1), ToInt64(500)}); Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size); std::unique_ptr result; @@ -252,13 +273,35 @@ TEST(SliceToDynamicSliceRewriteTest, DontRewriteSliceWithNonConstSize) { Attr(kXlaCompileTimeConstantInputsAttr))))); } +TEST(SliceToDynamicSliceRewriteTest, ScalarSlice) { + Scope root = Scope::NewRootScope() + .ExitOnError() + .WithAssignedDevice(kDeviceName) + .WithXlaCluster("cluster_0"); + + Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT); + Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64); + Output size = ops::Const(root.WithOpName("size"), {}); + Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size); + + std::unique_ptr result; + TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result)); + + Node* static_shaped_slice = testing::FindNodeByName( + result.get(), "slice/static_shaped_slice/static_shaped_slice"); + ASSERT_NE(static_shaped_slice, nullptr); + EXPECT_THAT(static_shaped_slice, + NodeWith(Op("Slice"), Attr(kXlaCompileTimeConstantInputsAttr), + Inputs(_, _, Out(NodeWith(Name(size.node()->name())))))); +} + TEST(SliceToDynamicSliceRewriteTest, IndicesNotVector) { Scope root = Scope::NewRootScope() .ExitOnError() .WithAssignedDevice(kDeviceName) .WithXlaCluster("cluster_0"); - auto to_int64 = [](int v) { return static_cast(v); }; + auto ToInt64 = [](int v) { return static_cast(v); }; Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT); Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64); @@ -271,7 +314,7 @@ TEST(SliceToDynamicSliceRewriteTest, IndicesNotVector) { ops::Slice(root.WithOpName("slice"), input, begin, 
size_placeholder); Output size = - ops::Const(root.WithOpName("size"), {{to_int64(-1)}, {to_int64(500)}}); + ops::Const(root.WithOpName("size"), {{ToInt64(-1)}, {ToInt64(500)}}); TF_ASSERT_OK(root.graph()->UpdateEdge(size.node(), 0, slice.node(), 2)); std::unique_ptr result; @@ -281,5 +324,82 @@ TEST(SliceToDynamicSliceRewriteTest, IndicesNotVector) { Not(Contains(NodeWith(Op("Slice"), Attr(kXlaCompileTimeConstantInputsAttr))))); } + +TEST(SliceToDynamicSliceRewriteTest, SliceWithSliceInput) { + Scope root = Scope::NewRootScope() + .ExitOnError() + .WithAssignedDevice(kDeviceName) + .WithXlaCluster("cluster_0"); + + Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT); + Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32); + Output size_a = ops::Const(root.WithOpName("size_a"), {-1, 500}); + Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size_a); + + Output size_b = ops::Const(root.WithOpName("size_a"), {-1, 200}); + Output slice_with_slice_input = ops::Slice( + root.WithOpName("slice_with_slice_input"), slice, begin, size_b); + + std::unique_ptr result; + TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result)); + + Node* static_shaped_slice = testing::FindNodeByName( + result.get(), + "slice_with_slice_input/static_shaped_slice/static_shaped_slice"); + ASSERT_NE(static_shaped_slice, nullptr); + EXPECT_EQ(static_shaped_slice->output_type(0), DT_FLOAT) + << "Expected DT_FLOAT, was " + << DataType_Name(static_shaped_slice->output_type(0)); + EXPECT_THAT( + static_shaped_slice, + NodeWith( + Op("Slice"), + Inputs(Out(NodeWith( + Op("Slice"), + Name("slice/static_shaped_slice/static_shaped_slice"))), + _, _))); +} + +TEST(SliceToDynamicSliceRewriteTest, SliceWithSliceBegin) { + Scope root = Scope::NewRootScope() + .ExitOnError() + .WithAssignedDevice(kDeviceName) + .WithXlaCluster("cluster_0"); + + Output input_float = + ops::Placeholder(root.WithOpName("input_float"), DT_FLOAT); + Output input_i64 = ops::Placeholder(root.WithOpName("input_i64"), DT_INT64); + + Output begin_begin = + ops::Placeholder(root.WithOpName("begin_begin"), DT_INT32); + Output begin_size = ops::Const(root.WithOpName("begin_size"), {-1}); + Output begin = + ops::Slice(root.WithOpName("begin"), input_i64, begin_begin, begin_size); + + Output size = + ops::Const(root.WithOpName("size"), {ToInt64(-1), ToInt64(200)}); + Output slice_with_slice_begin = ops::Slice( + root.WithOpName("slice_with_slice_begin"), input_float, begin, size); + + std::unique_ptr result; + TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result)); + + Node* static_shaped_slice = testing::FindNodeByName( + result.get(), + "slice_with_slice_begin/static_shaped_slice/static_shaped_slice"); + ASSERT_NE(static_shaped_slice, nullptr); + EXPECT_EQ(static_shaped_slice->output_type(0), DT_FLOAT) + << "Expected DT_FLOAT, was " + << DataType_Name(static_shaped_slice->output_type(0)); + EXPECT_THAT( + static_shaped_slice, + NodeWith( + Op("Slice"), + Inputs(_, + Out(NodeWith( + Op("Slice"), + Name("begin/static_shaped_slice/static_shaped_slice"))), + _))); +} } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index 830db9ebdd9..0583774714c 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -12,10 +12,10 @@ cc_library( hdrs = ["xla_ops.h"], deps = [ "//tensorflow/compiler/jit:common", + "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:xla_compilation_cache", 
"//tensorflow/compiler/jit:xla_device", "//tensorflow/compiler/jit:xla_launch_util", - "//tensorflow/compiler/jit/legacy_flags:xla_ops_common_flags", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 055de7afcc5..ad71df5a694 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -18,7 +18,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/memory/memory.h" #include "tensorflow/compiler/jit/defs.h" -#include "tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -418,7 +418,7 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { cannot_compile_cluster = cannot_compile_cluster_; } - if (legacy_flags::GetXlaOpsCommonFlags().tf_xla_always_defer_compilation || + if (GetXlaOpsCommonFlags().tf_xla_always_defer_compilation || cannot_compile_cluster) { executable = nullptr; } else { diff --git a/tensorflow/compiler/jit/legacy_flags/BUILD b/tensorflow/compiler/jit/legacy_flags/BUILD deleted file mode 100644 index 5fa6c85f06f..00000000000 --- a/tensorflow/compiler/jit/legacy_flags/BUILD +++ /dev/null @@ -1,65 +0,0 @@ -# Legacy command line flags for the XLA bridge libraries. - -# Please do not add more flags to this package. - -# The XLA bridge libraries were written in an environment that allowed -# command-line flags to be scattered freely throughout the libraries. This -# model, while initially convenient, leads to a proliferation in unused command -# line flags in tests and binaries, and serious problems in servers, where one -# might wish parameters to be different in independent RPC calls to the same -# routine. -# -# Please don't add more flags. If you're a library author, pass options and -# parameters explicitly through the library's interface. 
- -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) - -cc_library( - name = "mark_for_compilation_pass_flags", - srcs = ["mark_for_compilation_pass_flags.cc"], - hdrs = ["mark_for_compilation_pass_flags.h"], - deps = - [ - "//tensorflow/compiler/xla:parse_flags_from_env", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - ], -) - -cc_library( - name = "xla_device_flags", - srcs = ["xla_device_flags.cc"], - hdrs = ["xla_device_flags.h"], - deps = - [ - "//tensorflow/compiler/xla:parse_flags_from_env", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - ], -) - -cc_library( - name = "build_xla_ops_pass_flags", - srcs = ["build_xla_ops_pass_flags.cc"], - hdrs = ["build_xla_ops_pass_flags.h"], - deps = - [ - "//tensorflow/compiler/xla:parse_flags_from_env", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - ], -) - -cc_library( - name = "xla_ops_common_flags", - srcs = ["xla_ops_common_flags.cc"], - hdrs = ["xla_ops_common_flags.h"], - deps = - [ - "//tensorflow/compiler/xla:parse_flags_from_env", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - ], -) diff --git a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc deleted file mode 100644 index 961c17c17ea..00000000000 --- a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include // NOLINT - -#include "tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h" -#include "tensorflow/compiler/xla/parse_flags_from_env.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace tensorflow { -namespace legacy_flags { -namespace { - -BuildXlaOpsPassFlags* flags; -std::vector* flag_list; -std::once_flag flags_init; - -void AllocateAndParseFlags() { - flags = new BuildXlaOpsPassFlags; - flags->tf_xla_enable_lazy_compilation = true; - flag_list = new std::vector({ - Flag("tf_xla_enable_lazy_compilation", - &flags->tf_xla_enable_lazy_compilation, ""), - }); - xla::ParseFlagsFromEnv(*flag_list); -} - -} // namespace - -const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags() { - std::call_once(flags_init, &AllocateAndParseFlags); - return *flags; -} -} // namespace legacy_flags -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h b/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h deleted file mode 100644 index 9aa5cf64d6d..00000000000 --- a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_BUILD_XLA_OPS_PASS_FLAGS_H_ -#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_BUILD_XLA_OPS_PASS_FLAGS_H_ - -namespace tensorflow { -namespace legacy_flags { - -// Flags for the build_xla_ops pass. -struct BuildXlaOpsPassFlags { - // Enables lazy compilation for TF/XLA (only when auto-clustering) if true. - // Defaults to true. - bool tf_xla_enable_lazy_compilation; -}; - -// Parses the flags in BuildXlaOpsPassFlags from the TF_XLA_FLAGS environment -// variable and returns a reference to the parsed copy. Parses TF_XLA_FLAGS -// only the first time this routine is called. -const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags(); - -} // namespace legacy_flags -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_BUILD_XLA_OPS_PASS_FLAGS_H_ diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc deleted file mode 100644 index bad306e0b0a..00000000000 --- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Legacy flags for the XLA bridge's mark_for_compilation_pass module. - -#include -#include - -#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" -#include "tensorflow/compiler/xla/parse_flags_from_env.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace tensorflow { -namespace legacy_flags { - -// Pointers to the parsed value of the flags and flag descriptors, initialized -// via flags_init. -static MarkForCompilationPassFlags* flags; -static std::vector* flag_list; -static std::once_flag flags_init; - -// Allocate *flags. Called via call_once(&flags_init,...). 
-static void AllocateFlags() { - flags = new MarkForCompilationPassFlags; - flags->tf_xla_auto_jit = 0; - flags->tf_xla_min_cluster_size = 2; - flags->tf_xla_max_cluster_size = std::numeric_limits::max(); - flags->tf_xla_clustering_debug = false; - flags->tf_xla_cpu_global_jit = false; - flags->tf_xla_clustering_fuel = std::numeric_limits::max(); - flags->tf_xla_fusion_only = false; - flag_list = new std::vector( - {Flag("tf_xla_auto_jit", &flags->tf_xla_auto_jit, - "Control compilation of operators into XLA computations on CPU and " - "GPU devices. 0 = use ConfigProto setting; -1 = off; 1 = on for " - "things very likely to be improved; 2 = on for everything. " - "Experimental."), - Flag("tf_xla_min_cluster_size", &flags->tf_xla_min_cluster_size, - "Minimum number of operators in an XLA compilation. Ignored for " - "operators placed on an XLA device or operators explicitly marked " - "for compilation."), - Flag("tf_xla_max_cluster_size", &flags->tf_xla_max_cluster_size, - "Maximum number of operators in an XLA compilation."), - Flag("tf_xla_clustering_debug", &flags->tf_xla_clustering_debug, - "Dump graphs during XLA compilation."), - Flag("tf_xla_cpu_global_jit", &flags->tf_xla_cpu_global_jit, - "Enables global JIT compilation for CPU via SessionOptions."), - Flag("tf_xla_clustering_fuel", &flags->tf_xla_clustering_fuel, - "Places an artificial limit on the number of ops marked as " - "eligible for clustering."), - Flag("tf_xla_fusion_only", &flags->tf_xla_fusion_only, - "enable fusion of element-wise operations only using XLA when " - "global_jit_level is ON*.")}); - xla::ParseFlagsFromEnv(*flag_list); - - if (VLOG_IS_ON(1)) { - VLOG(1) << "Parsed MarkForCompilationPassFlags:"; - VLOG(1) << " tf_xla_auto_jit = " << flags->tf_xla_auto_jit; - VLOG(1) << " tf_xla_min_cluster_size = " << flags->tf_xla_min_cluster_size; - VLOG(1) << " tf_xla_max_cluster_size = " << flags->tf_xla_max_cluster_size; - VLOG(1) << " tf_xla_clustering_debug = " << flags->tf_xla_clustering_debug; - VLOG(1) << " tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit; - VLOG(1) << " tf_xla_clustering_fuel = " << flags->tf_xla_clustering_fuel; - VLOG(1) << " tf_xla_fusion_only = " << flags->tf_xla_fusion_only; - } -} - -// Append to *append_to flag definitions associated with the XLA bridge's -// mark_for_compilation_pass module. -void AppendMarkForCompilationPassFlags(std::vector* append_to) { - std::call_once(flags_init, &AllocateFlags); - append_to->insert(append_to->end(), flag_list->begin(), flag_list->end()); -} - -// Return a pointer to the MarkForCompilationPassFlags struct; -// repeated calls return the same pointer. -// This should be called only after Flags::Parse() has returned. -MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() { - std::call_once(flags_init, &AllocateFlags); - return flags; -} - -} // namespace legacy_flags -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc deleted file mode 100644 index 76b80d3034c..00000000000 --- a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Legacy flags for the XLA bridge's xla_device module. - -#include -#include - -#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h" -#include "tensorflow/compiler/xla/parse_flags_from_env.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace tensorflow { -namespace legacy_flags { - -// Pointers to the parsed value of the flags and flag descriptors, initialized -// via flags_init. -static XlaDeviceFlags* flags; -static std::vector* flag_list; -static std::once_flag flags_init; - -// Allocate *flags. Called via call_once(&flags_init,...). -static void AllocateFlags() { - flags = new XlaDeviceFlags; - flags->tf_xla_compile_on_demand = false; - flag_list = new std::vector({ - Flag("tf_xla_compile_on_demand", &flags->tf_xla_compile_on_demand, - "Switch a device into 'on-demand' mode, where instead of " - "autoclustering ops are compiled one by one just-in-time."), - }); - xla::ParseFlagsFromEnv(*flag_list); -} - -// Return a pointer to the XlaDeviceFlags struct; -// repeated calls return the same pointer. -// This should be called only after Flags::Parse() has returned. -XlaDeviceFlags* GetXlaDeviceFlags() { - std::call_once(flags_init, &AllocateFlags); - return flags; -} - -} // namespace legacy_flags -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h deleted file mode 100644 index 27b22121ac1..00000000000 --- a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_ -#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_ - -// Legacy flags for the XLA bridge's xla_device module. - -#include - -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace tensorflow { -namespace legacy_flags { - -// The values of flags associated with the XLA bridge's -// xla_device module. -typedef struct { - // Switch the CPU device into "on-demand" mode, where instead of - // autoclustering ops are compiled one by one just-in-time. - // Enabling this mode by a legacy flag is a temporary mechanism. When this - // feature is battle-tested, we will switch this to be a session option. 
- bool tf_xla_compile_on_demand; -} XlaDeviceFlags; - -// Return a pointer to the XlaDeviceFlags struct; -// repeated calls return the same pointer. -// This should be called only after Flags::Parse() has returned. -XlaDeviceFlags* GetXlaDeviceFlags(); - -} // namespace legacy_flags -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_ diff --git a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc deleted file mode 100644 index 1443d48a734..00000000000 --- a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include // NOLINT -#include - -#include "tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h" -#include "tensorflow/compiler/xla/parse_flags_from_env.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace tensorflow { -namespace legacy_flags { - -XlaOpsCommonFlags* flags; -std::vector* flag_list; -std::once_flag flags_init; - -void AllocateAndParseFlags() { - flags = new XlaOpsCommonFlags; - flags->tf_xla_always_defer_compilation = false; - flag_list = new std::vector({ - Flag("tf_xla_always_defer_compilation", - &flags->tf_xla_always_defer_compilation, ""), - }); - xla::ParseFlagsFromEnv(*flag_list); - - if (VLOG_IS_ON(1)) { - VLOG(1) << "Parsed XlaOpsCommonFlags:"; - VLOG(1) << " tf_xla_always_defer_compilation = " - << flags->tf_xla_always_defer_compilation; - } -} - -const XlaOpsCommonFlags& GetXlaOpsCommonFlags() { - std::call_once(flags_init, &AllocateAndParseFlags); - return *flags; -} -} // namespace legacy_flags -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h b/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h deleted file mode 100644 index 7c5c1818ef2..00000000000 --- a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_OPS_COMMON_FLAGS_H_ -#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_OPS_COMMON_FLAGS_H_ - -namespace tensorflow { -namespace legacy_flags { - -// Flags common to the _Xla* ops and their kernels. -struct XlaOpsCommonFlags { - // If true, _XlaCompile always refuses to compile the cluster, which means the - // XLA clusters always run in the TF executor. Defaults to false. - bool tf_xla_always_defer_compilation; -}; - -// Parses the flags in XlaOpsCommonFlags from the TF_XLA_FLAGS environment -// variable and returns a reference to the parsed copy. Parses TF_XLA_FLAGS -// only the first time this routine is called. -const XlaOpsCommonFlags& GetXlaOpsCommonFlags(); - -} // namespace legacy_flags -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_OPS_COMMON_FLAGS_H_ diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 70033cae0af..6618e3a58ab 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -24,8 +24,8 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/jit/deadness_analysis.h" #include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" -#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" #include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" @@ -72,6 +72,11 @@ struct OperationFilter { // to resort to a dummy implementation. Currently Assert and CheckNumerics ops // have dummy XLA implementations. bool allow_dummy_ops; + + // Whether ops that produce or consume DT_VARIANT values are allowed. We + // don't auto-cluster these ops because we don't yet support live-in or + // live-out DT_VARIANT values. 
+ bool allow_ops_producing_or_consuming_variant; }; bool IsDummyImplOp(absl::string_view op_name) { @@ -81,7 +86,13 @@ bool IsDummyImplOp(absl::string_view op_name) { bool IsStatefulRandomOp(absl::string_view op_name) { return op_name == "RandomUniform" || op_name == "RandomShuffle" || op_name == "RandomUniformInt" || op_name == "RandomStandardNormal" || - op_name == "TruncatedNormal"; + op_name == "TruncatedNormal" || op_name == "Multinomial"; +} + +bool OpProducesOrConsumesVariant(const Node& node) { + auto is_variant = [](DataType dtype) { return dtype == DT_VARIANT; }; + return absl::c_any_of(node.input_types(), is_variant) || + absl::c_any_of(node.output_types(), is_variant); } bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) { @@ -246,6 +257,10 @@ bool IsCompilableCall(const NodeDef& call_def, if (!op_filter.allow_dummy_ops && IsDummyImplOp(node->type_string())) { return false; } + if (!op_filter.allow_ops_producing_or_consuming_variant && + OpProducesOrConsumesVariant(*node)) { + return false; + } if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, op_filter, depth + 1, lib_runtime)) { @@ -427,8 +442,7 @@ Status FindCompilationCandidates( BackwardsConstAnalysis(graph, /*compile_time_const_arg_indices=*/nullptr, &compile_time_const_nodes)); - int64& fuel = - legacy_flags::GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel; + int64& fuel = GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel; // Iterate over nodes in sorted order so that compiler fuel is deterministic. // We can't simply pass op_nodes().begin() and op_nodes().end to the @@ -471,16 +485,15 @@ Status FindCompilationCandidates( XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration)); DeviceType jit_device_type(registration->compilation_device_name); + bool always_auto_cluster = registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways; + OperationFilter op_filter; op_filter.allow_resource_ops = registration->compile_resource_ops; - op_filter.allow_stateful_rng_ops = - (registration->autoclustering_policy == - XlaOpRegistry::AutoclusteringPolicy::kAlways); - op_filter.allow_control_trigger = - (registration->autoclustering_policy == - XlaOpRegistry::AutoclusteringPolicy::kAlways); - op_filter.allow_dummy_ops = (registration->autoclustering_policy == - XlaOpRegistry::AutoclusteringPolicy::kAlways); + op_filter.allow_stateful_rng_ops = always_auto_cluster; + op_filter.allow_control_trigger = always_auto_cluster; + op_filter.allow_dummy_ops = always_auto_cluster; + op_filter.allow_ops_producing_or_consuming_variant = always_auto_cluster; if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, op_filter, 0, @@ -504,6 +517,12 @@ Status FindCompilationCandidates( << node->type_string() << ")"; continue; } + if (!op_filter.allow_ops_producing_or_consuming_variant && + OpProducesOrConsumesVariant(*node)) { + VLOG(2) << "Rejecting " << node->name() + << ": produces or consumes DT_VARIANT"; + continue; + } if (!op_filter.allow_resource_ops && (HasResourceOutput(*node) || IsNonResourceVarResourceOp(*node))) { @@ -607,8 +626,7 @@ OptimizerOptions::GlobalJitLevel GetGlobalJitLevel( // To set compilation to be on by default, change the following line. 
global_jit_level = OptimizerOptions::OFF; } - legacy_flags::MarkForCompilationPassFlags* flags = - legacy_flags::GetMarkForCompilationPassFlags(); + MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); if (flags->tf_xla_auto_jit == -1 || (1 <= flags->tf_xla_auto_jit && flags->tf_xla_auto_jit <= 2)) { // If the flag tf_xla_auto_jit is a valid, non-zero setting, it overrides @@ -641,6 +659,7 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) { op_filter.allow_stateful_rng_ops = true; op_filter.allow_control_trigger = true; op_filter.allow_dummy_ops = true; + op_filter.allow_ops_producing_or_consuming_variant = true; return IsCompilableCall(ndef, jit_device_type, op_filter, 0, flr); } @@ -651,8 +670,7 @@ Status MarkForCompilationPass::Run( // device ahead of time. OptimizerOptions::GlobalJitLevel global_jit_level = GetGlobalJitLevel(options); - legacy_flags::MarkForCompilationPassFlags* flags = - legacy_flags::GetMarkForCompilationPassFlags(); + MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); bool fusion_only = flags->tf_xla_fusion_only; VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only; @@ -953,8 +971,7 @@ Status MarkForCompilationPass::RunImpl( OptimizerOptions::GlobalJitLevel global_jit_level = GetGlobalJitLevel(options); - legacy_flags::MarkForCompilationPassFlags* flags = - legacy_flags::GetMarkForCompilationPassFlags(); + MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); // Repeatedly contract edges between clusters that are on the same device, // provided the contraction would not create a cycle. diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 24d78c07726..bf2c5508ea9 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -22,6 +22,7 @@ limitations under the License. 
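The new allow_ops_producing_or_consuming_variant filter above keys on DT_VARIANT edges, which is exactly what TensorList ops traffic in. A small sketch of that relationship, assuming the Python tensor-list wrappers in tensorflow.python.ops.list_ops:

```python
# Sketch: a TensorList handle is a DT_VARIANT tensor, which is why ops such as
# TensorListReserve are excluded from auto-clustering by the filter above.
import tensorflow as tf
from tensorflow.python.ops import list_ops

handle = list_ops.tensor_list_reserve(
    element_shape=[2], num_elements=3, element_dtype=tf.float32)
print(handle.dtype)  # variant
```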
#include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/control_flow_ops_internal.h" #include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/list_ops.h" #include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/sendrecv_ops.h" #include "tensorflow/cc/ops/standard_ops.h" @@ -1147,5 +1148,80 @@ TEST(XlaCompilationTest, DontAutoClusterDummyOps) { EXPECT_EQ(clusters["test/check"], ""); } +TEST(XlaCompilationTest, DontAutoClusterOpsProducingVariant) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output a = ops::Placeholder(root.WithOpName("test/a"), DT_INT64); + Output b = ops::Placeholder(root.WithOpName("test/b"), DT_INT64); + + Output cast_a = ops::Cast(root.WithOpName("test/cast_a"), a, DT_INT32); + Output cast_b = ops::Cast(root.WithOpName("test/cast_b"), b, DT_INT32); + + Output tensor_list_reserve = ops::TensorListReserve( + root.WithOpName("test/tensor_list_reserve"), cast_a, cast_b, DT_FLOAT); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(root.ToGraph(graph.get())); + + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_EQ(clusters["test/tensor_list_reserve"], ""); +} + +TEST(XlaCompilationTest, DontAutoClusterOpsConsumingVariant) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output dummy_input = + ops::Placeholder(root.WithOpName("test/dummy_input"), DT_INT64); + Output variant_input = + ops::Placeholder(root.WithOpName("test/variant_input"), DT_VARIANT); + + // Create one more node so that we don't avoid creating a cluster solely + // because it would be trivial. + Output dummy_cast = + ops::Cast(root.WithOpName("test/dummy_cast"), dummy_input, DT_INT32); + + Output tensor_list_element_shape = ops::TensorListElementShape( + root.WithOpName("test/tensor_list_element_shape"), variant_input, + DT_INT32); + + root.graph()->AddControlEdge(dummy_cast.node(), + tensor_list_element_shape.node()); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(root.ToGraph(graph.get())); + + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_EQ(clusters["test/tensor_list_element_shape"], ""); +} + +TEST(XlaCompilationTest, ClusterOpsProducingVariantIfOnXlaDevice) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output a = ops::Placeholder(root.WithOpName("test/a"), DT_INT64); + Output b = ops::Placeholder(root.WithOpName("test/b"), DT_INT64); + + Output cast_a = ops::Cast(root.WithOpName("test/cast_a"), a, DT_INT32); + Output cast_b = ops::Cast(root.WithOpName("test/cast_b"), b, DT_INT32); + + Output tensor_list_reserve = ops::TensorListReserve( + root.WithOpName("test/tensor_list_reserve"), cast_a, cast_b, DT_FLOAT); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(root.ToGraph(graph.get())); + + string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; + for (Node* n : graph->nodes()) { + if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { + n->set_assigned_device_name(xla_cpu_device); + } + } + + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_NE(clusters["test/tensor_list_reserve"], ""); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc 
b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc index d56d0f8ccfc..64a33017457 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc @@ -34,15 +34,9 @@ namespace tensorflow { // // It may be worth refactoring out XlaOpRegistry::RegisterCompilationDevice to // make this more direct, but probably not worth it solely for this test. - std::vector<Device*> devices; + std::vector<std::unique_ptr<Device>> devices; TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(*session_options, "", &devices)); - auto delete_devices = gtl::MakeCleanup([&] { - for (Device* d : devices) { - delete d; - } - }); - GraphOptimizationPassOptions opt_options; opt_options.graph = graph; opt_options.session_options = session_options; diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD index f72224545b2..64409d93347 100644 --- a/tensorflow/compiler/jit/ops/BUILD +++ b/tensorflow/compiler/jit/ops/BUILD @@ -18,3 +18,9 @@ tf_gen_op_wrapper_py( out = "xla_ops.py", deps = ["//tensorflow/compiler/jit/ops:xla_ops"], ) + +py_library( + name = "xla_ops_grad", + srcs = ["xla_ops_grad.py"], + deps = ["//tensorflow/python:framework_ops"], +) diff --git a/tensorflow/contrib/estimator/python/estimator/dnn.py b/tensorflow/compiler/jit/ops/xla_ops_grad.py similarity index 62% rename from tensorflow/contrib/estimator/python/estimator/dnn.py rename to tensorflow/compiler/jit/ops/xla_ops_grad.py index 10f657df8de..2d31d8dc714 100644 --- a/tensorflow/contrib/estimator/python/estimator/dnn.py +++ b/tensorflow/compiler/jit/ops/xla_ops_grad.py @@ -1,3 +1,4 @@ +"""Gradients for XLA ops.""" # Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,21 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""dnn python module. - -Importing from tensorflow.python.estimator is unsupported -and will soon break! -""" -# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow_estimator.contrib.estimator.python.estimator import dnn +from tensorflow.python.framework import ops -# Include attrs that start with single underscore. -_HAS_DYNAMIC_ATTRIBUTES = True -dnn.__all__ = [s for s in dir(dnn) if not s.startswith('__')] -from tensorflow_estimator.contrib.estimator.python.estimator.dnn import * +@ops.RegisterGradient("XlaClusterOutput") +def _XlaClusterOutputGrad(_, grad): + del grad # unused + raise RuntimeError("Gradient computation of graph in xla.compile() is " + "prohibited because it can cause performance degradation. " + "Please move gradient computation inside xla.compile().") diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc index 36b345ecbff..42ea3926e16 100644 --- a/tensorflow/compiler/jit/partially_decluster_pass.cc +++ b/tensorflow/compiler/jit/partially_decluster_pass.cc @@ -26,6 +26,10 @@ limitations under the License.
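The xla_ops_grad.py registration above deliberately raises instead of returning a gradient, so any attempt to differentiate through an XlaClusterOutput fails loudly. The same raise-on-gradient pattern can be reproduced without building an XLA cluster by rerouting an ordinary op's gradient; the registration name RaiseOnGrad below is hypothetical and only for illustration:

```python
# Illustration of the raise-on-gradient pattern used for XlaClusterOutput,
# applied here to Identity via gradient_override_map (hypothetical names).
import tensorflow as tf
from tensorflow.python.framework import ops


@ops.RegisterGradient("RaiseOnGrad")
def _raise_on_grad(op, grad):
  del op, grad  # unused
  raise RuntimeError("Gradient computation through this op is prohibited.")


g = tf.Graph()
with g.as_default():
  x = tf.constant(3.0)
  # Route Identity's gradient through the raising function registered above.
  with g.gradient_override_map({"Identity": "RaiseOnGrad"}):
    y = tf.identity(x)
  try:
    tf.gradients(y, x)
  except RuntimeError as err:
    print("caught:", err)
```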
namespace tensorflow { namespace { + +bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); } + +namespace reduce_device_to_host_copies { Status FindNodesToDecluster(const Graph& graph, absl::flat_hash_set* result, absl::Span post_order) { @@ -140,8 +144,6 @@ Status PartiallyDeclusterNode(Graph* graph, Node* n) { return Status::OK(); } -bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); } - // Clones nodes to outside their cluster to avoid device-to-host copies. For // instance, converts this: // @@ -168,7 +170,7 @@ bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); } // where the ===> arrow has a hostmem source and destination and would entail a // device to host copy if the source and destination were not in the same XLA // cluster. -Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) { +Status PartiallyDeclusterGraph(Graph* graph) { // When deciding whether to decluster a particular node, we base our decision // on if we've decided that some of its consumers have to be declustered too. // Iterating the graph in post-order guarantees that consumers have been @@ -206,7 +208,9 @@ Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) { return Status::OK(); } +} // namespace reduce_device_to_host_copies +namespace reduce_recompilation { bool IsIntraClusterEdge(const Edge& edge) { absl::optional src_cluster_name = GetXlaClusterForNode(*edge.src()); @@ -269,7 +273,7 @@ Status MustCompileNode(const Node* n, bool* must_compile) { // regress performance in any significant manner. We will have to revisit this // algorith with a more complex cost model if this assumption turns out to be // incorrect. -Status DeclusterNodesToReduceRecompilations(Graph* graph) { +Status PartiallyDeclusterGraph(Graph* graph) { std::vector compile_time_const_nodes(graph->num_node_ids()); TF_RETURN_IF_ERROR(BackwardsConstAnalysis( *graph, nullptr, &compile_time_const_nodes, IsIntraClusterEdge)); @@ -322,7 +326,7 @@ Status DeclusterNodesToReduceRecompilations(Graph* graph) { return Status::OK(); } - +} // namespace reduce_recompilation } // namespace Status PartiallyDeclusterPass::Run( @@ -334,8 +338,9 @@ Status PartiallyDeclusterPass::Run( Graph* graph = options.graph->get(); - TF_RETURN_IF_ERROR(PartiallyDeclusterToRemoveDeviceToHostCopies(graph)); - TF_RETURN_IF_ERROR(DeclusterNodesToReduceRecompilations(graph)); + TF_RETURN_IF_ERROR( + reduce_device_to_host_copies::PartiallyDeclusterGraph(graph)); + TF_RETURN_IF_ERROR(reduce_recompilation::PartiallyDeclusterGraph(graph)); return Status::OK(); } diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc index 1fc5da5071f..38a54cc5efa 100644 --- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc +++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc @@ -386,7 +386,7 @@ TEST(PartiallyDeclusterPassTest, DontDeclusterXlaDeviceOps) { TF_ASSERT_OK(s.ToGraph(graph.get())); // This is needed to register the XLA_GPU device. 
- std::vector devices; + std::vector> devices; TF_ASSERT_OK(DeviceFactory::AddDevices( SessionOptions(), "/job:localhost/replica:0/task:0", &devices)); @@ -400,10 +400,6 @@ TEST(PartiallyDeclusterPassTest, DontDeclusterXlaDeviceOps) { TF_ASSERT_OK(PartiallyDecluster(&graph)); EXPECT_EQ(GetXlaClusterForNode(*n), "cluster_0"); - - for (Device* d : devices) { - delete d; - } } TEST(PartiallyDeclusterPassTest, DontDeclusterNonTensorFlowOps) { diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index 116e0756036..7df898ad12a 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -17,8 +17,8 @@ limitations under the License. // operators using XLA via the XLA "Host" (CPU) backend. #include "absl/memory/memory.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/kernels/xla_ops.h" -#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h" #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_device_ops.h" @@ -31,13 +31,13 @@ namespace tensorflow { class XlaCpuDeviceFactory : public DeviceFactory { public: Status CreateDevices(const SessionOptions& options, const string& name_prefix, - std::vector* devices) override; + std::vector>* devices) override; }; -Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& session_options, - const string& name_prefix, - std::vector* devices) { - legacy_flags::XlaDeviceFlags* flags = legacy_flags::GetXlaDeviceFlags(); +Status XlaCpuDeviceFactory::CreateDevices( + const SessionOptions& session_options, const string& name_prefix, + std::vector>* devices) { + XlaDeviceFlags* flags = GetXlaDeviceFlags(); bool compile_on_demand = flags->tf_xla_compile_on_demand; XlaOpRegistry::DeviceRegistration registration; @@ -63,8 +63,7 @@ Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& session_options, options.device_ordinal = 0; options.compilation_device_name = DEVICE_CPU_XLA_JIT; options.use_multiple_streams = false; - auto device = absl::make_unique(session_options, options); - devices->push_back(device.release()); + devices->push_back(absl::make_unique(session_options, options)); return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 5c1b55cb57f..4201ff91a89 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -218,6 +218,9 @@ XlaDevice::XlaDevice(const SessionOptions& session_options, XlaDevice::~XlaDevice() { VLOG(1) << "Destroying XLA device " << jit_device_name_ << " " << this; mutex_lock lock(mu_); + while (outstanding_asynchronous_operations_ > 0) { + outstanding_asynchronous_operations_cv_.wait(lock); + } if (device_context_) { device_context_->Unref(); } @@ -384,6 +387,7 @@ void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, Status XlaDevice::Sync() { VLOG(1) << "XlaDevice::Sync"; + tracing::ScopedActivity activity("XlaDevice::Sync", /*is_expensive=*/true); std::shared_ptr stream; { mutex_lock lock(mu_); @@ -391,13 +395,46 @@ Status XlaDevice::Sync() { } if (!stream) return Status::OK(); - if (!stream->parent()->SynchronizeAllActivity() || !stream->ok()) { + Status status = stream->BlockHostUntilDone(); + { + mutex_lock lock(mu_); + while (outstanding_asynchronous_operations_ > 0) { + outstanding_asynchronous_operations_cv_.wait(lock); + } + } + TF_RETURN_IF_ERROR(status); + if 
(!stream->ok()) { return errors::Internal("XlaDevice::Sync() failed."); } VLOG(1) << "XlaDevice::Sync completed"; return Status::OK(); } +void XlaDevice::Sync(const DoneCallback& done) { + VLOG(1) << "XlaDevice::Sync (asynchronous)"; + std::shared_ptr stream; + { + mutex_lock lock(mu_); + stream = stream_; + } + if (!stream) { + done(Status::OK()); + return; + } + + stream->ThenEnqueueOnBackgroundThread( + [this, stream, done](se::StreamExecutor*) { + tracing::ScopedActivity activity("XlaDevice::Sync::Callback", + /*is_expensive=*/true); + mutex_lock lock(mu_); + while (outstanding_asynchronous_operations_ > 0) { + outstanding_asynchronous_operations_cv_.wait(lock); + } + done(stream->ok() ? Status::OK() + : errors::Internal("XlaDevice::Sync() failed.")); + }); +} + Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, Tensor* tensor) { @@ -441,6 +478,49 @@ bool XlaDevice::RequiresSyncOnCompletion() const { return sync_on_completion_; } +XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle( + XlaDevice* device) + : device_(device) { + mutex_lock lock(device_->mu_); + ++device_->outstanding_asynchronous_operations_; +} + +XlaDevice::AsynchronousOperationHandle::~AsynchronousOperationHandle() { + if (device_) { + mutex_lock lock(device_->mu_); + --device_->outstanding_asynchronous_operations_; + device_->outstanding_asynchronous_operations_cv_.notify_all(); + } +} + +XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle( + const XlaDevice::AsynchronousOperationHandle& other) + : device_(other.device_) { + mutex_lock lock(device_->mu_); + ++device_->outstanding_asynchronous_operations_; +} + +XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle( + XlaDevice::AsynchronousOperationHandle&& other) + : device_(other.device_) { + other.device_ = nullptr; +} + +XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle:: +operator=(const XlaDevice::AsynchronousOperationHandle& other) { + device_ = other.device_; + mutex_lock lock(device_->mu_); + ++device_->outstanding_asynchronous_operations_; + return *this; +} + +XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle:: +operator=(XlaDevice::AsynchronousOperationHandle&& other) { + device_ = other.device_; + other.device_ = nullptr; + return *this; +} + XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device, const char* jit_device) { // Any op assigned to the device that isn't rewritten by the graph rewriter diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 49f53b477ef..c8bb276cdb9 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -135,6 +135,7 @@ class XlaDevice : public LocalDevice { void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, AsyncOpKernel::DoneCallback done) override; Status Sync() override; + void Sync(const DoneCallback& done) override; Status FillContextMap(const Graph* graph, DeviceContextMap* device_context_map) override @@ -164,7 +165,30 @@ class XlaDevice : public LocalDevice { bool RequiresSyncOnCompletion() const override LOCKS_EXCLUDED(mu_); + // A simple RAII handle. On construction the device's + // outstanding_asynchronous_operations_ field is incremented; on destruction + // it is decremented. 
+ class AsynchronousOperationHandle { + public: + AsynchronousOperationHandle(XlaDevice* device); + ~AsynchronousOperationHandle(); + AsynchronousOperationHandle(const AsynchronousOperationHandle& other); + AsynchronousOperationHandle(AsynchronousOperationHandle&& other); + AsynchronousOperationHandle& operator=( + const AsynchronousOperationHandle& other); + AsynchronousOperationHandle& operator=(AsynchronousOperationHandle&& other); + + private: + XlaDevice* device_ = nullptr; + }; + + AsynchronousOperationHandle CreateAsynchronousOperationHandle() { + return AsynchronousOperationHandle(this); + } + private: + friend class AsynchronousOperationHandle; + xla::LocalClient* client() const; Allocator* GetAllocatorLocked(AllocatorAttributes attr) EXCLUSIVE_LOCKS_REQUIRED(mu_); @@ -227,6 +251,11 @@ class XlaDevice : public LocalDevice { // True if the device requires XlaDevice::Sync to be called on completion // regardless of status. bool sync_on_completion_ GUARDED_BY(mu_) = false; + + // Count of outstanding asynchronous operations which must be zero on Sync() + // completion. + int64 outstanding_asynchronous_operations_ GUARDED_BY(mu_) = 0; + condition_variable outstanding_asynchronous_operations_cv_; }; // Builds OpKernel registrations on 'device' for the JIT operators diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 44197016958..944f732b99c 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -29,12 +29,12 @@ namespace tensorflow { class XlaGpuDeviceFactory : public DeviceFactory { public: Status CreateDevices(const SessionOptions& options, const string& name_prefix, - std::vector* devices) override; + std::vector>* devices) override; }; -Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options, - const string& name_prefix, - std::vector* devices) { +Status XlaGpuDeviceFactory::CreateDevices( + const SessionOptions& session_options, const string& name_prefix, + std::vector>* devices) { XlaOpRegistry::DeviceRegistration registration; registration.compilation_device_name = DEVICE_GPU_XLA_JIT; registration.autoclustering_policy = @@ -70,7 +70,7 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options, return status; } - devices->push_back(device.release()); + devices->push_back(std::move(device)); } return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc index e828bae865d..4007309ed1c 100644 --- a/tensorflow/compiler/jit/xla_interpreter_device.cc +++ b/tensorflow/compiler/jit/xla_interpreter_device.cc @@ -33,12 +33,12 @@ constexpr std::array kExecAllTypes = { class XlaInterpreterDeviceFactory : public DeviceFactory { public: Status CreateDevices(const SessionOptions& options, const string& name_prefix, - std::vector* devices) override; + std::vector>* devices) override; }; Status XlaInterpreterDeviceFactory::CreateDevices( const SessionOptions& session_options, const string& name_prefix, - std::vector* devices) { + std::vector>* devices) { static XlaDeviceOpRegistrations* registrations = RegisterXlaDeviceKernels( DEVICE_XLA_INTERPRETER, DEVICE_INTERPRETER_XLA_JIT); (void)registrations; @@ -61,8 +61,7 @@ Status XlaInterpreterDeviceFactory::CreateDevices( options.device_ordinal = 0; options.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT; options.use_multiple_streams = false; - auto device = absl::make_unique(session_options, options); - 
devices->push_back(device.release()); + devices->push_back(absl::make_unique(session_options, options)); return Status::OK(); } diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 6b8e6bba1e1..bc3d60b90e5 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -375,27 +375,6 @@ tf_xla_py_test( ], ) -tf_xla_py_test( - name = "resampler_ops_test", - size = "small", - srcs = ["resampler_ops_test.py"], - disabled_backends = [ - # TODO(b/74459949) Support BatchDot in CPU backend. - "cpu", - "cpu_ondemand", - ], - # TODO(b/112295522): figure out how to make OSS build pass. - tags = ["no_oss"], - deps = [ - ":xla_test", - "//tensorflow/contrib/resampler:resampler_ops", - "//tensorflow/contrib/resampler:resampler_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:platform_test", - ], -) - tf_xla_py_test( name = "dynamic_stitch_test", size = "small", @@ -474,7 +453,6 @@ tf_xla_py_test( "//tensorflow/python:extra_py_tests_deps", "//tensorflow/python:framework", "//tensorflow/python:platform_test", - "//tensorflow/python:spectral_ops", "//tensorflow/python/ops/signal", ], ) diff --git a/tensorflow/compiler/tests/adagrad_da_test.py b/tensorflow/compiler/tests/adagrad_da_test.py index 69fb3ec2964..e9c2d363aca 100644 --- a/tensorflow/compiler/tests/adagrad_da_test.py +++ b/tensorflow/compiler/tests/adagrad_da_test.py @@ -50,8 +50,8 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1]), global_step=global_step) variables.global_variables_initializer().run() - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run a step of AdagradDA update.run() @@ -63,9 +63,9 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): # For -0.1*3.0*(0.1 - 0)/(0 + sqrt(0.1 + 0.1*0.1)) = -0.904534 # similarly for others. 
self.assertAllCloseAccordingToType( - np.array([-0.904534, -1.603567]), var0.eval()) + np.array([-0.904534, -1.603567]), self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([-0.094821, -0.189358]), var1.eval()) + np.array([-0.094821, -0.189358]), self.evaluate(var1)) def testAdagradDAwithoutRegularizationBasic2(self): for dtype in self.float_types: @@ -87,16 +87,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1]), global_step=global_step) variables.global_variables_initializer().run() - self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) - self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0)) + self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1)) # Run a step of AdagradDA update.run() self.assertAllCloseAccordingToType( - np.array([-0.904534, -1.603567]), var0.eval()) + np.array([-0.904534, -1.603567]), self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([-0.094821, -0.189358]), var1.eval()) + np.array([-0.094821, -0.189358]), self.evaluate(var1)) def testAdagradDAWithL1(self): for dtype in self.float_types: @@ -118,16 +118,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1]), global_step=global_step) variables.global_variables_initializer().run() - self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) - self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0)) + self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1)) # Run a step of AdagradDA update.run() self.assertAllCloseAccordingToType( - np.array([-0.895489, -1.59555]), var0.eval()) + np.array([-0.895489, -1.59555]), self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([-0.085339, -0.17989]), var1.eval()) + np.array([-0.085339, -0.17989]), self.evaluate(var1)) def testAdagradDAWithL1_L2(self): for dtype in self.float_types: @@ -149,16 +149,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1]), global_step=global_step) variables.global_variables_initializer().run() - self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) - self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0)) + self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1)) # Run a step of AdagradDA update.run() self.assertAllCloseAccordingToType( - np.array([-0.046907, -0.093659]), var0.eval()) + np.array([-0.046907, -0.093659]), self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([-0.004275, -0.009023]), var1.eval()) + np.array([-0.004275, -0.009023]), self.evaluate(var1)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py index ab69319c59f..e26483303c3 100644 --- a/tensorflow/compiler/tests/adagrad_test.py +++ b/tensorflow/compiler/tests/adagrad_test.py @@ -42,17 +42,19 @@ class AdagradOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 3 steps of adagrad for _ in range(3): ada_update.run() # Validate updated params 
self.assertAllCloseAccordingToType( - np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(), + np.array([-1.6026098728179932, -0.6026098728179932]), + self.evaluate(var0), float_rtol=1e-5) self.assertAllCloseAccordingToType( - np.array([2.715679168701172, 3.715679168701172]), var1.eval(), + np.array([2.715679168701172, 3.715679168701172]), + self.evaluate(var1), float_rtol=1e-5) def testTensorLearningRate(self): @@ -68,17 +70,19 @@ class AdagradOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 3 steps of adagrad for _ in range(3): ada_update.run() # Validate updated params self.assertAllCloseAccordingToType( - np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(), + np.array([-1.6026098728179932, -0.6026098728179932]), + self.evaluate(var0), float_rtol=1e-5) self.assertAllCloseAccordingToType( - np.array([2.715679168701172, 3.715679168701172]), var1.eval(), + np.array([2.715679168701172, 3.715679168701172]), + self.evaluate(var1), float_rtol=1e-5) def testSharing(self): @@ -103,18 +107,20 @@ class AdagradOptimizerTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values. - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Mix the first and the second adagrad for 3 steps. ada_update1.run() ada_update2.run() ada_update1.run() # Validate updated params (the same as with only 1 Adagrad). 
self.assertAllCloseAccordingToType( - np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(), + np.array([-1.6026098728179932, -0.6026098728179932]), + self.evaluate(var0), float_rtol=1e-5) self.assertAllCloseAccordingToType( - np.array([2.715679168701172, 3.715679168701172]), var1.eval(), + np.array([2.715679168701172, 3.715679168701172]), + self.evaluate(var1), float_rtol=1e-5) diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py index 058576b3d4b..8bcff9d379d 100644 --- a/tensorflow/compiler/tests/adam_test.py +++ b/tensorflow/compiler/tests/adam_test.py @@ -75,23 +75,24 @@ class AdamOptimizerTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) beta1_power, beta2_power = opt._get_beta_accumulators() # Run 3 steps of Adam for t in range(1, 4): - self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) - self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power)) + self.assertAllCloseAccordingToType(0.999**t, + self.evaluate(beta2_power)) update.run(feed_dict={grads0: grads0_np, grads1: grads1_np}) var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) def testTensorLearningRate(self): for dtype in self.float_types: @@ -117,23 +118,24 @@ class AdamOptimizerTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) beta1_power, beta2_power = opt._get_beta_accumulators() # Run 3 steps of Adam for t in range(1, 4): - self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) - self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power)) + self.assertAllCloseAccordingToType(0.999**t, + self.evaluate(beta2_power)) update.run(feed_dict={grads0: grads0_np, grads1: grads1_np}) var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) def testSharing(self): for dtype in self.float_types: @@ -162,13 +164,14 @@ class AdamOptimizerTest(xla_test.XLATestCase): beta1_power, beta2_power = opt._get_beta_accumulators() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 3 steps of 
intertwined Adam1 and Adam2. for t in range(1, 4): - self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) - self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power)) + self.assertAllCloseAccordingToType(0.999**t, + self.evaluate(beta2_power)) if t % 2 == 0: update1.run(feed_dict={grads0: grads0_np, grads1: grads1_np}) else: @@ -178,8 +181,8 @@ class AdamOptimizerTest(xla_test.XLATestCase): var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/adamax_test.py b/tensorflow/compiler/tests/adamax_test.py index 3ed1d41b712..961b46375c9 100644 --- a/tensorflow/compiler/tests/adamax_test.py +++ b/tensorflow/compiler/tests/adamax_test.py @@ -78,8 +78,8 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) beta1_power = opt._get_beta_accumulators() @@ -87,14 +87,17 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase): for t in range(1, 4): update.run() - self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval()) + self.assertAllCloseAccordingToType(0.9**(t + 1), + self.evaluate(beta1_power)) var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval(), rtol=1e-2) - self.assertAllCloseAccordingToType(var1_np, var1.eval(), rtol=1e-2) + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0), rtol=1e-2) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1), rtol=1e-2) self.assertEqual("var0_%d/AdaMax:0" % (i,), opt.get_slot(var=var0, name="m").name) @@ -118,22 +121,23 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) beta1_power = opt._get_beta_accumulators() # Run 3 steps of AdaMax for t in range(1, 4): - self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) + self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power)) update.run() var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + if __name__ == "__main__": test.main() diff --git a/tensorflow/compiler/tests/addsign_test.py b/tensorflow/compiler/tests/addsign_test.py index 1bc07ace23c..a37c97e6d37 100644 --- a/tensorflow/compiler/tests/addsign_test.py +++ 
b/tensorflow/compiler/tests/addsign_test.py @@ -90,8 +90,8 @@ class AddSignTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 7 steps of AddSign # first 4 steps with positive gradient @@ -125,8 +125,8 @@ class AddSignTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( - var0_np, var0.eval(), half_rtol=1e-2) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + var0_np, self.evaluate(var0), half_rtol=1e-2) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) def testDense(self): decay_steps = 10 diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 332381c59ee..9a5423c1b2a 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -218,6 +218,21 @@ class BinaryOpsTest(xla_test.XLATestCase): ], equality_test=self.ListsAreClose) + # TF doesn't define these for bf16. + if dtype != dtypes.bfloat16.as_numpy_dtype: + self._testBinary( + gen_math_ops.xdivy, + np.array([0, 4, 3, 2, 1, 0], dtype=dtype), + np.array([0, 5, 6, 7, 8, float("NaN")], dtype=dtype), + expected=np.array([0, 0.8, 0.5, 0.285714, 0.125, 0], dtype=dtype)) + + self._testBinary( + gen_math_ops.xlogy, + np.array([0, 4, 3, 2, 1, 0], dtype=dtype), + np.array([0, 5, 6, 7, 8, float("NaN")], dtype=dtype), + expected=np.array([0, 6.437752, 5.375278, 3.89182, 2.079442, 0], + dtype=dtype)) + def testIntOps(self): for dtype in self.signed_int_types: self._testBinary( diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py index a57d1dc81ea..5d5e486f616 100644 --- a/tensorflow/compiler/tests/categorical_op_test.py +++ b/tensorflow/compiler/tests/categorical_op_test.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import random_seed from tensorflow.python.ops import array_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.platform import googletest @@ -56,11 +57,11 @@ class CategoricalTest(xla_test.XLATestCase): Returns: Frequencies from sampled classes; shape [batch_size, num_classes]. """ - with self.cached_session() as sess, self.test_scope(): + with self.cached_session(), self.test_scope(): random_seed.set_random_seed(1618) op = random_ops.multinomial(logits, num_samples, output_dtype=dtypes.int32) - d = sess.run(op) + d = self.evaluate(op) batch_size, num_classes = logits.shape freqs_mat = [] @@ -79,15 +80,15 @@ class CategoricalTest(xla_test.XLATestCase): def _testRngIsNotConstant(self, rng, dtype, output_dtype): # Tests that 'rng' does not always return the same value. - with self.cached_session() as sess: + with self.cached_session(): with self.test_scope(): x = rng(dtype, output_dtype) # The random-number generator, if working correctly, should produce the # same output multiple times with low probability. - y = sess.run(x) - z = sess.run(x) - w = sess.run(x) + y = self.evaluate(x) + z = self.evaluate(x) + w = self.evaluate(x) # We use exact equality here. If the random-number generator is producing # deterministic output, all three outputs will be bitwise identical. 
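The xdivy/xlogy expectations added to binary_ops_test.py above encode the defining corner case of these ops: the result is 0 wherever x is 0, even when y is NaN or would otherwise divide to infinity. A NumPy reference that reproduces the expected arrays used in the test:

```python
# NumPy reference for the xdivy/xlogy expected values in binary_ops_test.py:
# both ops return 0 wherever x == 0, regardless of y (including y == NaN).
import numpy as np

x = np.array([0., 4., 3., 2., 1., 0.], dtype=np.float32)
y = np.array([0., 5., 6., 7., 8., np.nan], dtype=np.float32)

with np.errstate(divide="ignore", invalid="ignore"):
  xdivy = np.where(x == 0., 0., x / y)
  xlogy = np.where(x == 0., 0., x * np.log(y))

print(xdivy)  # [0.  0.8  0.5  0.2857143  0.125  0.]
print(xlogy)  # [0.  6.437752  5.375278  3.89182  2.0794415  0.]
```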
@@ -107,12 +108,12 @@ class CategoricalTest(xla_test.XLATestCase): def testCategoricalIsInRange(self): for dtype in self.float_types: for output_dtype in self.output_dtypes(): - with self.cached_session() as sess: + with self.cached_session(): with self.test_scope(): x = random_ops.multinomial( array_ops.ones(shape=[1, 20], dtype=dtype), 1000, output_dtype=output_dtype) - y = sess.run(x) + y = self.evaluate(x) self.assertTrue((y >= 0).sum() == 1000) self.assertTrue((y < 20).sum() == 1000) @@ -138,6 +139,57 @@ class CategoricalTest(xla_test.XLATestCase): chi2 = self._chi2(probs, freqs) self.assertLess(chi2, 1e-3) + def testStatelessMultinomialIsInRange(self): + for dtype in self.float_types: + for output_dtype in self.output_dtypes(): + with self.cached_session() as sess: + with self.test_scope(): + seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) + x = stateless_random_ops.stateless_multinomial( + array_ops.ones(shape=[1, 20], dtype=dtype), + 1000, + seed_t, + output_dtype=output_dtype) + y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]}) + self.assertTrue((y >= 0).sum() == 1000) + self.assertTrue((y < 20).sum() == 1000) + + def testDeterminismMultinomial(self): + # Stateless values should be equal iff the seeds are equal (roughly) + num_samples = 10 + with self.cached_session(), self.test_scope(): + seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) + seeds = [(x, y) for x in range(5) for y in range(5)] * 3 + for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2], + [0.25, 0.75]]): + pure = stateless_random_ops.stateless_multinomial( + logits, num_samples, seed=seed_t) + values = [(seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds] + for s0, v0 in values: + for s1, v1 in values: + self.assertEqual(s0 == s1, np.all(v0 == v1)) + + def testEmpty(self): + with self.cached_session(): + with self.test_scope(): + x = random_ops.multinomial( + array_ops.zeros([42, 40]), 0, output_dtype=dtypes.int32) + y = self.evaluate(x) + self.assertEqual(y.shape, (42, 0)) + + def testEmptyStateless(self): + with self.cached_session() as sess: + with self.test_scope(): + seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) + x = stateless_random_ops.stateless_multinomial( + array_ops.zeros([42, 40]), + 0, + seed=seed_t, + output_dtype=dtypes.int32) + y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]}) + self.assertEqual(y.shape, (42, 0)) + + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/compiler/tests/clustering_test.py b/tensorflow/compiler/tests/clustering_test.py index 88bd58b2da6..ef2d7af69de 100644 --- a/tensorflow/compiler/tests/clustering_test.py +++ b/tensorflow/compiler/tests/clustering_test.py @@ -43,7 +43,7 @@ class ClusteringTest(xla_test.XLATestCase): input1 = constant_op.constant(val1, name="const1") input2 = constant_op.constant(val2, name="const2") output = math_ops.add(input1, input2) - result = output.eval() + result = self.evaluate(output) self.assertAllClose(result, expected, rtol=1e-3) def testAddFromCpuMultiple(self): @@ -57,7 +57,7 @@ class ClusteringTest(xla_test.XLATestCase): with self.test_scope(): output = math_ops.add(input1, input2) for _ in xrange(10): - result = output.eval() + result = self.evaluate(output) self.assertAllClose(result, expected, rtol=1e-3) def testDeadlock(self): diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py index 2d225ad226c..2187f57960f 100644 --- a/tensorflow/compiler/tests/concat_ops_test.py +++ b/tensorflow/compiler/tests/concat_ops_test.py 
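The new testDeterminismMultinomial above relies on the stateless-op contract: the samples are a pure function of the logits, the sample count, and the two-element seed. A standalone graph-mode sketch of that contract, reusing the stateless_random_ops module imported by the test:

```python
# Sketch of the stateless seeding contract: identical seeds reproduce the
# samples exactly; different seeds should give different draws.
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import stateless_random_ops

logits = [[0.1, 0.25, 0.5, 0.15]]
seed_t = tf.placeholder(tf.int32, shape=[2])
samples = stateless_random_ops.stateless_multinomial(logits, 10, seed=seed_t)

with tf.Session() as sess:
  a = sess.run(samples, {seed_t: [1, 2]})
  b = sess.run(samples, {seed_t: [1, 2]})
  c = sess.run(samples, {seed_t: [3, 4]})
  print(np.array_equal(a, b))  # True: same seed, same samples
  print(np.array_equal(a, c))  # almost surely False: different seed
```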
@@ -72,7 +72,7 @@ class ConcatTest(xla_test.XLATestCase): x2 = constant_op.constant(p2) with self.test_scope(): c = array_ops.concat([x1, x2], 0) - result = c.eval() + result = self.evaluate(c) self.assertAllEqual(result[:2, :], p1) self.assertAllEqual(result[2:, :], p2) @@ -150,7 +150,7 @@ class ConcatTest(xla_test.XLATestCase): [float(x) for x in grad_inp.flatten()], shape=output_shape) grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor]) concated_grad = array_ops.concat(grad, 1) - result = concated_grad.eval() + result = self.evaluate(concated_grad) self.assertAllEqual(result, grad_inp) def testGradientsSimpleAll(self): @@ -177,7 +177,7 @@ class ConcatTest(xla_test.XLATestCase): [float(x) for x in grad_inp.flatten()], shape=output_shape) grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor]) concated_grad = array_ops.concat(grad, 0) - result = concated_grad.eval() + result = self.evaluate(concated_grad) self.assertAllEqual(result, grad_inp) @@ -205,7 +205,7 @@ class ConcatTest(xla_test.XLATestCase): [float(x) for x in grad_inp.flatten()], shape=output_shape) grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor]) concated_grad = array_ops.concat(grad, 2) - result = concated_grad.eval() + result = self.evaluate(concated_grad) self.assertAllEqual(result, grad_inp) @@ -242,7 +242,7 @@ class ConcatTest(xla_test.XLATestCase): [float(x) for x in grad_inp.flatten()], shape=output_shape) grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor]) concated_grad = array_ops.concat(grad, concat_dim) - result = concated_grad.eval() + result = self.evaluate(concated_grad) self.assertAllEqual(result, grad_inp) @@ -254,7 +254,7 @@ class ConcatTest(xla_test.XLATestCase): def DISABLED_testZeroSize(self): # Verify that concat doesn't crash and burn for zero size inputs np.random.seed(7) - with self.cached_session() as sess: + with self.cached_session(): with self.test_scope(): for shape0 in (), (2,): axis = len(shape0) @@ -270,7 +270,7 @@ class ConcatTest(xla_test.XLATestCase): self.assertAllEqual(c.eval(), correct) # Check gradients dc = np.random.randn(*c.get_shape().as_list()) - dxs = sess.run(gradients_impl.gradients(c, xs, dc)) + dxs = self.evaluate(gradients_impl.gradients(c, xs, dc)) self.assertAllEqual(dc, np.concatenate(dxs, axis=axis)) def testConcatTuple(self): @@ -280,7 +280,7 @@ class ConcatTest(xla_test.XLATestCase): with self.test_scope(): concat_list_t = array_ops.concat([c1, c2], 0) concat_tuple_t = array_ops.concat((c1, c2), 0) - self.assertAllEqual(concat_list_t.eval(), concat_tuple_t.eval()) + self.assertAllEqual(concat_list_t.eval(), self.evaluate(concat_tuple_t)) def testConcatNoScalars(self): with self.cached_session(): @@ -330,47 +330,47 @@ class ConcatTest(xla_test.XLATestCase): class ConcatOffsetTest(xla_test.XLATestCase): def testBasic(self): - with self.cached_session() as sess: + with self.cached_session(): with self.test_scope(): cdim = constant_op.constant(1, dtypes.int32) s0 = constant_op.constant([2, 3, 5], dtypes.int32) s1 = constant_op.constant([2, 7, 5], dtypes.int32) s2 = constant_op.constant([2, 20, 5], dtypes.int32) off = gen_array_ops.concat_offset(cdim, [s0, s1, s2]) - ans = sess.run(off) + ans = self.evaluate(off) self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]]) class PackTest(xla_test.XLATestCase): def testBasic(self): - with self.cached_session() as sess: + with self.cached_session(): with self.test_scope(): s0 = constant_op.constant([2, 3, 5], dtypes.int32) s1 = constant_op.constant([2, 7, 5], dtypes.int32) s2 = 
constant_op.constant([2, 20, 5], dtypes.int32) packed = array_ops.stack([s0, s1, s2]) - ans = sess.run(packed) + ans = self.evaluate(packed) self.assertAllEqual(ans, [[2, 3, 5], [2, 7, 5], [2, 20, 5]]) def testScalars(self): - with self.cached_session() as sess: + with self.cached_session(): with self.test_scope(): s0 = constant_op.constant(2, dtypes.int32) s1 = constant_op.constant(3, dtypes.int32) s2 = constant_op.constant(5, dtypes.int32) packed = array_ops.stack([s0, s1, s2]) - ans = sess.run(packed) + ans = self.evaluate(packed) self.assertAllEqual(ans, [2, 3, 5]) def testEmpty(self): - with self.cached_session() as sess: + with self.cached_session(): with self.test_scope(): s0 = constant_op.constant([[]], dtypes.int32) s1 = constant_op.constant([[]], dtypes.int32) s2 = constant_op.constant([[]], dtypes.int32) packed = array_ops.stack([s0, s1, s2]) - ans = sess.run(packed) + ans = self.evaluate(packed) self.assertAllEqual(ans, [[[]], [[]], [[]]]) diff --git a/tensorflow/compiler/tests/conv3d_test.py b/tensorflow/compiler/tests/conv3d_test.py index d59fd0236f4..01cc1b63928 100644 --- a/tensorflow/compiler/tests/conv3d_test.py +++ b/tensorflow/compiler/tests/conv3d_test.py @@ -85,7 +85,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase): 1.0, shape=f_shape, name="filter", dtype=dtypes.float32) output = nn_ops.conv3d_transpose( x, f, y_shape, strides=strides, padding="SAME") - value = output.eval() + value = self.evaluate(output) # We count the number of cells being added at the locations in the output. # At the center, #cells = kernel_depth * kernel_height * kernel_width @@ -135,7 +135,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase): 1.0, shape=f_shape, name="filter", dtype=dtypes.float32) output = nn_ops.conv3d_transpose( x, f, y_shape, strides=strides, padding="SAME") - value = output.eval() + value = self.evaluate(output) for n in xrange(x_shape[0]): for k in xrange(f_shape[3]): @@ -173,7 +173,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase): 1.0, shape=f_shape, name="filter", dtype=dtypes.float32) output = nn_ops.conv3d_transpose( x, f, y_shape, strides=strides, padding="VALID") - value = output.eval() + value = self.evaluate(output) cache_values = np.zeros(y_shape, dtype=np.float32) diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py index d1b90f098d7..bf5ea7b1fb6 100644 --- a/tensorflow/compiler/tests/dense_layer_test.py +++ b/tensorflow/compiler/tests/dense_layer_test.py @@ -42,7 +42,7 @@ def GetRunMetadataLabels(run_metadata): def InLabels(labels, substr): """Returns true iff one of the labels contains substr.""" - return any([substr in x for x in labels]) + return any(substr in x for x in labels) class DenseLayerTest(test.TestCase): @@ -72,7 +72,7 @@ class DenseLayerTest(test.TestCase): x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32) y = layers.dense(x, 3) - sess.run(variables.initialize_all_variables()) + self.evaluate(variables.initialize_all_variables()) run_metadata = config_pb2.RunMetadata() test_utils.RunWithWarmup( sess, @@ -97,7 +97,7 @@ class DenseLayerTest(test.TestCase): with jit_scope(): y = layers.dense(x, 3) - sess.run(variables.initialize_all_variables()) + self.evaluate(variables.initialize_all_variables()) run_metadata = config_pb2.RunMetadata() test_utils.RunWithWarmup( sess, @@ -126,7 +126,7 @@ class DenseLayerTest(test.TestCase): with jit_scope(): y = layers.dense(x, 3) - sess.run(variables.initialize_all_variables()) + self.evaluate(variables.initialize_all_variables()) 
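The recurring rewrites from Tensor.eval() and sess.run(...) to self.evaluate(...) in these tests are about portability: self.evaluate resolves a tensor through the test's session in graph mode and fetches the value directly in eager mode, where .eval() has no default session. A minimal TestCase sketch of the pattern:

```python
# Minimal sketch of the self.evaluate pattern the updated tests use; this
# example runs in graph mode via the cached session, but the same call also
# works when the test executes eagerly.
import tensorflow as tf


class EvaluateExampleTest(tf.test.TestCase):

  def testEvaluate(self):
    with self.cached_session():
      x = tf.constant([1.0, 2.0])
      self.assertAllClose([1.0, 2.0], self.evaluate(x))


if __name__ == "__main__":
  tf.test.main()
```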
run_metadata = config_pb2.RunMetadata() test_utils.RunWithWarmup( sess, diff --git a/tensorflow/compiler/tests/dynamic_stitch_test.py b/tensorflow/compiler/tests/dynamic_stitch_test.py index 50b04daa6b9..e89cf975f5d 100644 --- a/tensorflow/compiler/tests/dynamic_stitch_test.py +++ b/tensorflow/compiler/tests/dynamic_stitch_test.py @@ -58,6 +58,15 @@ class DynamicStitchTest(xla_test.XLATestCase): [idx1, idx2], [val1, val2], expected=np.array([[], [], [], []], np.int32)) + def testEmptyIndex(self): + idx1 = np.array([], dtype=np.int32) + idx2 = np.array([[], []], dtype=np.int32) + val1 = np.ndarray(shape=(0, 9), dtype=np.int32) + val2 = np.ndarray(shape=(2, 0, 9), dtype=np.int32) + self._AssertDynamicStitchResultIs([idx1, idx2], [val1, val2], + expected=np.ndarray( + shape=(0, 9), dtype=np.int32)) + def testSimple1D(self): val1 = np.array([0, 4, 7], dtype=np.int32) val2 = np.array([1, 6, 2, 3, 5], dtype=np.int32) diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py index 63cee550fde..2af32b537ba 100644 --- a/tensorflow/compiler/tests/eager_test.py +++ b/tensorflow/compiler/tests/eager_test.py @@ -101,12 +101,12 @@ class EagerTest(xla_test.XLATestCase): self.assertAllEqual(15, product) # Run some ops graphly - with context.graph_mode(), self.cached_session() as sess: + with context.graph_mode(), self.cached_session(): with self.test_scope(): three = constant_op.constant(3) five = constant_op.constant(5) product = three * five - self.assertAllEqual(15, sess.run(product)) + self.assertAllEqual(15, self.evaluate(product)) def testDegenerateSlices(self): with self.test_scope(): diff --git a/tensorflow/compiler/tests/fft_test.py b/tensorflow/compiler/tests/fft_test.py index e92afd5d6fe..0edd0c35aa2 100644 --- a/tensorflow/compiler/tests/fft_test.py +++ b/tensorflow/compiler/tests/fft_test.py @@ -27,8 +27,7 @@ from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl -from tensorflow.python.ops import signal -from tensorflow.python.ops import spectral_ops +from tensorflow.python.ops.signal import signal from tensorflow.python.platform import googletest BATCH_DIMS = (3, 5) @@ -107,39 +106,39 @@ class FFTTest(xla_test.XLATestCase): def testFFT(self): self._VerifyFftMethod(INNER_DIMS_1D, lambda x: x, np.fft.fft, - spectral_ops.fft) + signal.fft) def testFFT2D(self): self._VerifyFftMethod(INNER_DIMS_2D, lambda x: x, np.fft.fft2, - spectral_ops.fft2d) + signal.fft2d) def testFFT3D(self): self._VerifyFftMethod(INNER_DIMS_3D, lambda x: x, lambda x: np.fft.fftn(x, axes=(-3, -2, -1)), - spectral_ops.fft3d) + signal.fft3d) def testIFFT(self): self._VerifyFftMethod(INNER_DIMS_1D, lambda x: x, np.fft.ifft, - spectral_ops.ifft) + signal.ifft) def testIFFT2D(self): self._VerifyFftMethod(INNER_DIMS_2D, lambda x: x, np.fft.ifft2, - spectral_ops.ifft2d) + signal.ifft2d) def testIFFT3D(self): self._VerifyFftMethod(INNER_DIMS_3D, lambda x: x, lambda x: np.fft.ifftn(x, axes=(-3, -2, -1)), - spectral_ops.ifft3d) + signal.ifft3d) def testRFFT(self): self._VerifyFftMethod( INNER_DIMS_1D, np.real, lambda x: np.fft.rfft(x, n=x.shape[-1]), - lambda x: spectral_ops.rfft(x, fft_length=[x.shape[-1].value])) + lambda x: signal.rfft(x, fft_length=[x.shape[-1].value])) def testRFFT2D(self): def _tf_fn(x): - return spectral_ops.rfft2d( + return signal.rfft2d( x, fft_length=[x.shape[-2].value, x.shape[-1].value]) self._VerifyFftMethod( @@ -153,16 +152,33 @@ class 
FFTTest(xla_test.XLATestCase): x, axes=(-3, -2, -1), s=[x.shape[-3], x.shape[-2], x.shape[-1]]) def _tf_fn(x): - return spectral_ops.rfft3d( + return signal.rfft3d( x, fft_length=[x.shape[-3].value, x.shape[-2].value, x.shape[-1].value]) self._VerifyFftMethod(INNER_DIMS_3D, np.real, _to_expected, _tf_fn) + def testRFFT3DMismatchedSize(self): + + def _to_expected(x): + return np.fft.rfftn( + x, + axes=(-3, -2, -1), + s=[x.shape[-3] // 2, x.shape[-2], x.shape[-1] * 2]) + + def _tf_fn(x): + return signal.rfft3d( + x, + fft_length=[ + x.shape[-3].value // 2, x.shape[-2].value, x.shape[-1].value * 2 + ]) + + self._VerifyFftMethod(INNER_DIMS_3D, np.real, _to_expected, _tf_fn) + def testIRFFT(self): def _tf_fn(x): - return spectral_ops.irfft(x, fft_length=[2 * (x.shape[-1].value - 1)]) + return signal.irfft(x, fft_length=[2 * (x.shape[-1].value - 1)]) self._VerifyFftMethod( INNER_DIMS_1D, lambda x: np.fft.rfft(np.real(x), n=x.shape[-1]), @@ -171,7 +187,7 @@ class FFTTest(xla_test.XLATestCase): def testIRFFT2D(self): def _tf_fn(x): - return spectral_ops.irfft2d( + return signal.irfft2d( x, fft_length=[x.shape[-2].value, 2 * (x.shape[-1].value - 1)]) self._VerifyFftMethod( @@ -195,7 +211,7 @@ class FFTTest(xla_test.XLATestCase): s=[x.shape[-3], x.shape[-2], 2 * (x.shape[-1] - 1)]) def _tf_fn(x): - return spectral_ops.irfft3d( + return signal.irfft3d( x, fft_length=[ x.shape[-3].value, x.shape[-2].value, 2 * (x.shape[-1].value - 1) @@ -203,6 +219,30 @@ class FFTTest(xla_test.XLATestCase): self._VerifyFftMethod(INNER_DIMS_3D, _to_input, _to_expected, _tf_fn) + def testIRFFT3DMismatchedSize(self): + + def _to_input(x): + return np.fft.rfftn( + np.real(x), + axes=(-3, -2, -1), + s=[x.shape[-3] // 2, x.shape[-2], x.shape[-1] * 2]) + + def _to_expected(x): + return np.fft.irfftn( + x, + axes=(-3, -2, -1), + s=[x.shape[-3] // 2, x.shape[-2], x.shape[-1] * 2]) + + def _tf_fn(x): + return signal.irfft3d( + x, + fft_length=[ + x.shape[-3].value // 2, x.shape[-2].value, x.shape[-1].value * 2 + ]) + + self._VerifyFftMethod(INNER_DIMS_3D, _to_input, _to_expected, _tf_fn) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/compiler/tests/fifo_queue_test.py b/tensorflow/compiler/tests/fifo_queue_test.py index 8c7edfd277c..91d77d2f791 100644 --- a/tensorflow/compiler/tests/fifo_queue_test.py +++ b/tensorflow/compiler/tests/fifo_queue_test.py @@ -129,7 +129,7 @@ class FIFOQueueTest(xla_test.XLATestCase): enqueue_op.run() for i in xrange(len(elems)): - vals = dequeued_t.eval() + vals = self.evaluate(dequeued_t) self.assertEqual([elems[i]], vals) def testEnqueueAndBlockingDequeue(self): @@ -192,9 +192,9 @@ class FIFOQueueTest(xla_test.XLATestCase): self.assertEqual([], size.get_shape()) enqueue_op.run() - self.assertEqual(1, size.eval()) + self.assertEqual(1, self.evaluate(size)) dequeued_t.op.run() - self.assertEqual(0, size.eval()) + self.assertEqual(0, self.evaluate(size)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py index 5b197afd655..b078053cdbd 100644 --- a/tensorflow/compiler/tests/ftrl_test.py +++ b/tensorflow/compiler/tests/ftrl_test.py @@ -50,14 +50,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + 
self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run Ftrl for a few steps for _ in range(steps): ftrl_update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def equivAdagradTest_AdagradPart(self, steps, dtype): var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype) @@ -65,14 +65,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): adagrad_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run Adagrad for a few steps for _ in range(steps): adagrad_update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def equivGradientDescentTest_FtrlPart(self, steps, dtype): var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype) @@ -85,14 +85,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run Ftrl for a few steps for _ in range(steps): ftrl_update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def equivGradientDescentTest_GradientDescentPart(self, steps, dtype): var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype) @@ -100,14 +100,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): sgd_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run GradientDescent for a few steps for _ in range(steps): sgd_update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def testFtrlwithoutRegularization(self): for dtype in self.float_types: @@ -124,8 +124,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run 3 steps FTRL for _ in range(3): @@ -134,12 +134,12 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( np.array([-2.60260963, -4.29698515]), - var0.eval(), + self.evaluate(var0), float_rtol=1e-4, half_rtol=1e-2) self.assertAllCloseAccordingToType( np.array([-0.28432083, -0.56694895]), - var1.eval(), + self.evaluate(var1), float_rtol=1e-5, half_rtol=1e-2) @@ -158,8 +158,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - 
self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 3 steps FTRL for _ in range(3): @@ -167,10 +167,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( - np.array([-2.55607247, -3.98729396]), var0.eval(), 1e-5, 1e-5, + np.array([-2.55607247, -3.98729396]), + self.evaluate(var0), + 1e-5, + 1e-5, float_rtol=1e-4) self.assertAllCloseAccordingToType( - np.array([-0.28232238, -0.56096673]), var1.eval(), 1e-5, 1e-5) + np.array([-0.28232238, -0.56096673]), self.evaluate(var1), 1e-5, + 1e-5) def testFtrlWithL1(self): for dtype in self.float_types: @@ -187,8 +191,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps FTRL for _ in range(10): @@ -197,12 +201,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( np.array([-7.66718769, -10.91273689]), - var0.eval(), + self.evaluate(var0), rtol=1e-4, bfloat16_rtol=1e-1, bfloat16_atol=1e-1) self.assertAllCloseAccordingToType( - np.array([-0.93460727, -1.86147261]), var1.eval(), rtol=1e-4) + np.array([-0.93460727, -1.86147261]), + self.evaluate(var1), + rtol=1e-4) def testFtrlWithL1_L2(self): for dtype in self.float_types: @@ -219,8 +225,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps FTRL for _ in range(10): @@ -228,9 +234,13 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( - np.array([-0.24059935, -0.46829352]), var0.eval(), rtol=1e-5) + np.array([-0.24059935, -0.46829352]), + self.evaluate(var0), + rtol=1e-5) self.assertAllCloseAccordingToType( - np.array([-0.02406147, -0.04830509]), var1.eval(), rtol=1e-5) + np.array([-0.02406147, -0.04830509]), + self.evaluate(var1), + rtol=1e-5) def testFtrlWithL1_L2_L2Shrinkage(self): """Test the new FTRL op with support for l2 shrinkage. 
@@ -254,8 +264,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) - self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0)) + self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1)) # Run 10 steps FTRL for _ in range(10): @@ -263,9 +273,13 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( - np.array([-0.22578996, -0.44345799]), var0.eval(), rtol=1e-4) + np.array([-0.22578996, -0.44345799]), + self.evaluate(var0), + rtol=1e-4) self.assertAllCloseAccordingToType( - np.array([-0.14378493, -0.13229476]), var1.eval(), rtol=1e-4) + np.array([-0.14378493, -0.13229476]), + self.evaluate(var1), + rtol=1e-4) def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self): """Verifies that l2 shrinkage in FTRL does not change lr schedule.""" @@ -291,8 +305,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): update1 = opt1.apply_gradients([(grads1, var1)]) variables.global_variables_initializer().run() - self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) - self.assertAllCloseAccordingToType([1.0, 2.0], var1.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0)) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var1)) # Run 10 steps FTRL for _ in range(10): @@ -301,7 +315,7 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # var0 is experiencing L2 shrinkage so it should be smaller than var1 # in magnitude. - self.assertTrue((var0.eval()**2 < var1.eval()**2).all()) + self.assertTrue((var0.eval()**2 < self.evaluate(var1)**2).all()) accum0 = list(opt0._slots["accum"].values())[0].eval() accum1 = list(opt1._slots["accum"].values())[0].eval() # L2 shrinkage should not change how we update grad accumulator. 
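The bulk of the Python test changes in this patch follow one mechanical pattern: calls to `Tensor.eval()` and `sess.run(...)` are replaced with `self.evaluate(...)`, which `tf.test.TestCase` supports under both graph and eager execution. A minimal sketch of that pattern, not part of the patch itself (the class and test names below are illustrative):

```
# Minimal sketch of the eval()/sess.run() -> self.evaluate() migration.
# Uses only the public tf.test API; class/test names are illustrative.
import tensorflow as tf


class EvaluateMigrationSketch(tf.test.TestCase):

  def testProduct(self):
    three = tf.constant(3)
    five = tf.constant(5)
    product = three * five
    # Old, graph-only style:
    #   with self.cached_session() as sess:
    #     self.assertAllEqual(15, sess.run(product))
    # New style, valid under both graph and eager execution:
    self.assertAllEqual(15, self.evaluate(product))


if __name__ == "__main__":
  tf.test.main()
```

The same substitution accounts for most of the hunks in the optimizer, queue, and tensor-array tests in this patch.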
diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py index b1891b918c6..a61827c2ae4 100644 --- a/tensorflow/compiler/tests/function_test.py +++ b/tensorflow/compiler/tests/function_test.py @@ -40,7 +40,7 @@ class FunctionTest(xla_test.XLATestCase): bval = np.array([5, 6, 7, 8]).reshape([2, 2]).astype(np.float32) expected = APlus2B(aval, bval) - with self.cached_session() as sess: + with self.cached_session(): @function.Defun(dtypes.float32, dtypes.float32) def Foo(a, b): @@ -50,7 +50,7 @@ class FunctionTest(xla_test.XLATestCase): b = constant_op.constant(bval, name="b") with self.test_scope(): call_f = Foo(a, b) - result = sess.run(call_f) + result = self.evaluate(call_f) self.assertAllClose(result, expected, rtol=1e-3) def testNestedFunctions(self): @@ -66,7 +66,7 @@ class FunctionTest(xla_test.XLATestCase): bval = np.array([4, 3, 2, 1]).reshape([2, 2]).astype(np.float32) expected = APlus2B(aval, bval) - with self.cached_session() as sess: + with self.cached_session(): @function.Defun(dtypes.float32, dtypes.float32) def Foo(a, b): @@ -76,7 +76,7 @@ class FunctionTest(xla_test.XLATestCase): b = constant_op.constant(bval, name="b") with self.test_scope(): call_g = Foo(a, b) - result = sess.run(call_g) + result = self.evaluate(call_g) self.assertAllClose(result, expected, rtol=1e-3) def testFunctionMultipleRetvals(self): @@ -90,7 +90,7 @@ class FunctionTest(xla_test.XLATestCase): bval = np.array([5, 6, 7, 8]).reshape([2, 2]).astype(np.float32) expected = Func(aval, bval) - with self.cached_session() as sess: + with self.cached_session(): @function.Defun(dtypes.float32, dtypes.float32) def Foo(a, b): @@ -100,7 +100,7 @@ class FunctionTest(xla_test.XLATestCase): b = constant_op.constant(bval, name="b") with self.test_scope(): call_f = Foo(a, b) - result = sess.run(call_f) + result = self.evaluate(call_f) self.assertAllClose(result, expected, rtol=1e-3) def testCompileTimeConstantsInDefun(self): diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py index 6f51ae33a1b..dbea9849e21 100644 --- a/tensorflow/compiler/tests/jit_test.py +++ b/tensorflow/compiler/tests/jit_test.py @@ -75,7 +75,7 @@ def RunMetadataLabels(run_metadata): def InLabels(labels, substr): """Returns true iff one of the labels contains substr.""" - return any([substr in x for x in labels]) + return any(substr in x for x in labels) def MetadataHasXlaRunOp(run_metadata): diff --git a/tensorflow/compiler/tests/listdiff_op_test.py b/tensorflow/compiler/tests/listdiff_op_test.py index 58622114e4f..0210201fa71 100644 --- a/tensorflow/compiler/tests/listdiff_op_test.py +++ b/tensorflow/compiler/tests/listdiff_op_test.py @@ -33,13 +33,13 @@ class ListDiffTest(xla_test.XLATestCase): def _testListDiff(self, x, y, out, idx): for dtype in [dtypes.int32, dtypes.int64]: for index_dtype in [dtypes.int32, dtypes.int64]: - with self.cached_session() as sess: + with self.cached_session(): x_tensor = ops.convert_to_tensor(x, dtype=dtype) y_tensor = ops.convert_to_tensor(y, dtype=dtype) with self.test_scope(): out_tensor, idx_tensor = array_ops.listdiff( x_tensor, y_tensor, out_idx=index_dtype) - tf_out, tf_idx = sess.run([out_tensor, idx_tensor]) + tf_out, tf_idx = self.evaluate([out_tensor, idx_tensor]) self.assertAllEqual(out, tf_out) self.assertAllEqual(idx, tf_idx) self.assertEqual(1, out_tensor.get_shape().ndims) diff --git a/tensorflow/compiler/tests/lrn_ops_test.py b/tensorflow/compiler/tests/lrn_ops_test.py index c6ad67993e8..5dddf6ae4e8 100644 --- 
a/tensorflow/compiler/tests/lrn_ops_test.py +++ b/tensorflow/compiler/tests/lrn_ops_test.py @@ -120,8 +120,8 @@ class LRNTest(xla_test.XLATestCase): with self.test_scope(): actual = gen_nn_ops.lrn_grad(out_grads, in_image, out_image, depth_radius, bias, alpha, beta) - expected_val = expected.eval() - actual_val = actual.eval() + expected_val = self.evaluate(expected) + actual_val = self.evaluate(actual) self.assertAllClose(actual_val, expected_val, rtol=1e-3) diff --git a/tensorflow/compiler/tests/lstm_test.py b/tensorflow/compiler/tests/lstm_test.py index 265c0b6d141..776ed899e68 100644 --- a/tensorflow/compiler/tests/lstm_test.py +++ b/tensorflow/compiler/tests/lstm_test.py @@ -88,8 +88,8 @@ class LSTMTest(test.TestCase): (basename, m_prev_scalar, c_prev_scalar, pad_scalar)) # Initialize variables and run the unrolled LSTM step. - sess.run(variables.global_variables_initializer()) - return sess.run([m, c]) + self.evaluate(variables.global_variables_initializer()) + return self.evaluate([m, c]) def testLSTMCell(self): # Run with all-0 weights, no padding. @@ -173,8 +173,8 @@ class LSTMTest(test.TestCase): (basename, m_init_scalar, c_init_scalar, pad_scalar)) # Initialize variables and run the unrolled LSTM layer. - sess.run(variables.global_variables_initializer()) - return sess.run(out_seq) + self.evaluate(variables.global_variables_initializer()) + return self.evaluate(out_seq) def testLSTMLayer(self): # Run with all-0 weights, no padding. diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py index f77521a7c49..3416f7dbd6b 100644 --- a/tensorflow/compiler/tests/momentum_test.py +++ b/tensorflow/compiler/tests/momentum_test.py @@ -61,37 +61,43 @@ class MomentumOptimizerTest(xla_test.XLATestCase): self.assertFalse(slot1 in variables.trainable_variables()) # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Step 1: the momentum accumulators where 0. So we should see a normal # update: v -= grad * learning_rate mom_update.run() # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval()) - self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval()) + self.assertAllCloseAccordingToType( + np.array([0.1, 0.1]), self.evaluate(slot0)) + self.assertAllCloseAccordingToType( + np.array([0.01, 0.01]), self.evaluate(slot1)) # Check that the parameters have been updated. self.assertAllCloseAccordingToType( - np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval()) + np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), + self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval()) + np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), + self.evaluate(var1)) # Step 2: the momentum accumulators contain the previous update. mom_update.run() # Check that the momentum accumulators have been updated. self.assertAllCloseAccordingToType( - np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()) + np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), + self.evaluate(slot0)) self.assertAllCloseAccordingToType( - np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval()) + np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), + self.evaluate(slot1)) # Check that the parameters have been updated. 
self.assertAllCloseAccordingToType( np.array([ 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) - ]), var0.eval()) + ]), self.evaluate(var0)) self.assertAllCloseAccordingToType( np.array([ - 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - ( - (0.9 * 0.01 + 0.01) * 2.0) - ]), var1.eval()) + 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), + 3.98 - ((0.9 * 0.01 + 0.01) * 2.0) + ]), self.evaluate(var1)) def testNesterovMomentum(self): for dtype in self.float_types: @@ -115,8 +121,8 @@ class MomentumOptimizerTest(xla_test.XLATestCase): var0_np, accum0_np, var0_np * 0.8, 0.1, 0.9) var1_np, accum1_np = self._update_nesterov_momentum_numpy( var1_np, accum1_np, 0.9, 0.1, 0.9) - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) def testTensorLearningRateAndMomentum(self): for dtype in self.float_types: @@ -141,37 +147,43 @@ class MomentumOptimizerTest(xla_test.XLATestCase): self.assertFalse(slot1 in variables.trainable_variables()) # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Step 1: the momentum accumulators where 0. So we should see a normal # update: v -= grad * learning_rate mom_update.run() # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval()) - self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval()) + self.assertAllCloseAccordingToType( + np.array([0.1, 0.1]), self.evaluate(slot0)) + self.assertAllCloseAccordingToType( + np.array([0.01, 0.01]), self.evaluate(slot1)) # Check that the parameters have been updated. self.assertAllCloseAccordingToType( - np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval()) + np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), + self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval()) + np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), + self.evaluate(var1)) # Step 2: the momentum accumulators contain the previous update. mom_update.run() # Check that the momentum accumulators have been updated. self.assertAllCloseAccordingToType( - np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()) + np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), + self.evaluate(slot0)) self.assertAllCloseAccordingToType( - np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval()) + np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), + self.evaluate(slot1)) # Check that the parameters have been updated. 
self.assertAllCloseAccordingToType( np.array([ 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) - ]), var0.eval()) + ]), self.evaluate(var0)) self.assertAllCloseAccordingToType( np.array([ - 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - ( - (0.9 * 0.01 + 0.01) * 2.0) - ]), var1.eval()) + 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), + 3.98 - ((0.9 * 0.01 + 0.01) * 2.0) + ]), self.evaluate(var1)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py index 77bb839409f..9671ae0ae97 100644 --- a/tensorflow/compiler/tests/placeholder_test.py +++ b/tensorflow/compiler/tests/placeholder_test.py @@ -33,7 +33,7 @@ class PlaceholderTest(xla_test.XLATestCase): ph = array_ops.placeholder_with_default(v, shape=[]) out = ph * 2 sess.run(variables.variables_initializer([v])) - self.assertEqual(8.0, sess.run(out)) + self.assertEqual(8.0, self.evaluate(out)) def test_placeholder_with_default_fed(self): with self.cached_session() as sess, self.test_scope(): diff --git a/tensorflow/compiler/tests/powersign_test.py b/tensorflow/compiler/tests/powersign_test.py index 86536da7fed..5b35c200277 100644 --- a/tensorflow/compiler/tests/powersign_test.py +++ b/tensorflow/compiler/tests/powersign_test.py @@ -91,8 +91,8 @@ class PowerSignTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 7 steps of powersign # first 4 steps with positive gradient @@ -125,8 +125,8 @@ class PowerSignTest(xla_test.XLATestCase): ) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) def testDense(self): decay_steps = 10 diff --git a/tensorflow/compiler/tests/proximal_adagrad_test.py b/tensorflow/compiler/tests/proximal_adagrad_test.py index c41b4171e26..63cc51a4701 100644 --- a/tensorflow/compiler/tests/proximal_adagrad_test.py +++ b/tensorflow/compiler/tests/proximal_adagrad_test.py @@ -45,15 +45,17 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run 3 steps Proximal Adagrad. 
for _ in range(3): update.run() - self.assertAllClose(np.array([-2.60260963, -4.29698515]), var0.eval()) - self.assertAllClose(np.array([-0.28432083, -0.56694895]), var1.eval()) + self.assertAllClose( + np.array([-2.60260963, -4.29698515]), self.evaluate(var0)) + self.assertAllClose( + np.array([-0.28432083, -0.56694895]), self.evaluate(var1)) opt_vars = opt.variables() self.assertStartsWith(opt_vars[0].name, var0._shared_name) self.assertStartsWith(opt_vars[1].name, var1._shared_name) @@ -74,14 +76,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 3 steps Proximal Adagrad. for _ in range(3): update.run() - self.assertAllClose(np.array([-1.60261, -2.296985]), var0.eval()) - self.assertAllClose(np.array([3.715679, 2.433051]), var1.eval()) + self.assertAllClose(np.array([-1.60261, -2.296985]), self.evaluate(var0)) + self.assertAllClose(np.array([3.715679, 2.433051]), self.evaluate(var1)) def testProximalAdagradWithL1(self): with self.cached_session(), self.test_scope(): @@ -98,14 +100,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps Proximal Adagrad for _ in range(10): update.run() - self.assertAllClose(np.array([-6.663634, -9.190331]), var0.eval()) - self.assertAllClose(np.array([2.959304, 1.029232]), var1.eval()) + self.assertAllClose(np.array([-6.663634, -9.190331]), self.evaluate(var0)) + self.assertAllClose(np.array([2.959304, 1.029232]), self.evaluate(var1)) def testProximalAdagradWithL1_L2(self): with self.cached_session(), self.test_scope(): @@ -122,15 +124,15 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps Proximal Adagrad. 
for _ in range(10): update.run() - self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval()) - self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval()) + self.assertAllClose(np.array([-0.0495, -0.0995]), self.evaluate(var0)) + self.assertAllClose(np.array([-0.0045, -0.0095]), self.evaluate(var1)) def applyOptimizer(self, opt, steps=5): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) @@ -141,14 +143,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run ProximalAdagrad for a few steps for _ in range(steps): update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def testEquivAdagradwithoutRegularization(self): with self.cached_session(), self.test_scope(): diff --git a/tensorflow/compiler/tests/proximal_gradient_descent_test.py b/tensorflow/compiler/tests/proximal_gradient_descent_test.py index 3d808e6b8a7..5aec433be76 100644 --- a/tensorflow/compiler/tests/proximal_gradient_descent_test.py +++ b/tensorflow/compiler/tests/proximal_gradient_descent_test.py @@ -42,15 +42,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run 3 steps Proximal Gradient Descent. for _ in range(3): update.run() - self.assertAllClose(np.array([-0.9, -1.8]), var0.eval()) - self.assertAllClose(np.array([-0.09, -0.18]), var1.eval()) + self.assertAllClose(np.array([-0.9, -1.8]), self.evaluate(var0)) + self.assertAllClose(np.array([-0.09, -0.18]), self.evaluate(var1)) def testProximalGradientDescentwithoutRegularization2(self): with self.cached_session(), self.test_scope(): @@ -64,15 +64,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 3 steps Proximal Gradient Descent for _ in range(3): update.run() - self.assertAllClose(np.array([0.1, 0.2]), var0.eval()) - self.assertAllClose(np.array([3.91, 2.82]), var1.eval()) + self.assertAllClose(np.array([0.1, 0.2]), self.evaluate(var0)) + self.assertAllClose(np.array([3.91, 2.82]), self.evaluate(var1)) def testProximalGradientDescentWithL1(self): with self.cached_session(), self.test_scope(): @@ -86,15 +86,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps proximal gradient descent. 
for _ in range(10): update.run() - self.assertAllClose(np.array([-1.988, -3.988001]), var0.eval()) - self.assertAllClose(np.array([3.67, 2.37]), var1.eval()) + self.assertAllClose(np.array([-1.988, -3.988001]), self.evaluate(var0)) + self.assertAllClose(np.array([3.67, 2.37]), self.evaluate(var1)) def testProximalGradientDescentWithL1_L2(self): with self.cached_session(), self.test_scope(): @@ -108,15 +108,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps Proximal Gradient Descent for _ in range(10): update.run() - self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval()) - self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval()) + self.assertAllClose(np.array([-0.0495, -0.0995]), self.evaluate(var0)) + self.assertAllClose(np.array([-0.0045, -0.0095]), self.evaluate(var1)) def applyOptimizer(self, opt, steps=5): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) @@ -127,14 +127,14 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run ProximalAdagrad for a few steps for _ in range(steps): update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def testEquivGradientDescentwithoutRegularization(self): with self.cached_session(), self.test_scope(): diff --git a/tensorflow/compiler/tests/qr_op_test.py b/tensorflow/compiler/tests/qr_op_test.py index 236b1b881dc..b4d4193e35f 100644 --- a/tensorflow/compiler/tests/qr_op_test.py +++ b/tensorflow/compiler/tests/qr_op_test.py @@ -63,7 +63,7 @@ class QrOpTest(xla_test.XLATestCase, parameterized.TestCase): # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity. xx = math_ops.matmul(x, x, adjoint_a=True) identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0) - precision = self.AdjustedNorm(xx.eval() - identity.eval()) + precision = self.AdjustedNorm(xx.eval() - self.evaluate(identity)) self.assertTrue(np.all(precision < 5.0)) def _test(self, dtype, shape, full_matrices): diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py index 36ef6ed5fee..97ffad34c00 100644 --- a/tensorflow/compiler/tests/random_ops_test.py +++ b/tensorflow/compiler/tests/random_ops_test.py @@ -46,9 +46,9 @@ class RandomOpsTest(xla_test.XLATestCase): # The random-number generator, if working correctly, should produce the # same output multiple times with low probability. - y = sess.run(x) - z = sess.run(x) - w = sess.run(x) + y = self.evaluate(x) + z = self.evaluate(x) + w = self.evaluate(x) # We use exact equality here. If the random-number generator is producing # deterministic output, all three outputs will be bitwise identical. 
@@ -83,7 +83,7 @@ class RandomOpsTest(xla_test.XLATestCase): with self.test_scope(): x = random_ops.random_uniform( shape=[1000], dtype=dtype, minval=-2, maxval=33) - y = sess.run(x) + y = self.evaluate(x) self.assertTrue((y >= -2).sum() == 1000) self.assertTrue((y < 33).sum() == 1000) @@ -102,7 +102,7 @@ class RandomOpsTest(xla_test.XLATestCase): with self.cached_session() as sess: with self.test_scope(): x = random_ops.truncated_normal(shape=[count], dtype=dtype) - y = sess.run(x) + y = self.evaluate(x) def normal_cdf(x): return .5 * math.erfc(-x / math.sqrt(2)) @@ -111,7 +111,7 @@ class RandomOpsTest(xla_test.XLATestCase): return math.exp(-(x**2) / 2.) / math.sqrt(2 * math.pi) def probit(x, sess=sess): - return sess.run(special_math.ndtri(x)) + return self.evaluate(special_math.ndtri(x)) a = -2. b = 2. @@ -148,7 +148,7 @@ class RandomOpsTest(xla_test.XLATestCase): with self.test_scope(): x = math_ops.range(1 << 16) shuffle = random_ops.random_shuffle(x) - result = sess.run(shuffle) + result = self.evaluate(shuffle) expected = range(1 << 16) # Compare sets to avoid randomness behavior changes but make sure still # have all the values. @@ -159,7 +159,7 @@ class RandomOpsTest(xla_test.XLATestCase): with self.test_scope(): x = array_ops.diag(math_ops.range(20)) shuffle = random_ops.random_shuffle(x) - result = sess.run(shuffle) + result = self.evaluate(shuffle) expected = np.diag(range(20)).flatten() # Compare sets to avoid randomness behavior changes but make sure still # have all the values. diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index a6b58020126..d23fd125163 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -3382,10 +3382,10 @@ int main(int argc, char** argv) { } // XLA devices register kernels at construction time; create all known devices // to make sure the kernels are registered. 
- std::vector<tensorflow::Device*> devices; + std::vector<std::unique_ptr<tensorflow::Device>> devices; TF_CHECK_OK(tensorflow::DeviceFactory::AddDevices( tensorflow::SessionOptions(), "", &devices)); - tensorflow::DeviceMgr device_mgr(devices); + tensorflow::DeviceMgr device_mgr(std::move(devices)); tensorflow::Device* ignored; TF_QCHECK_OK( diff --git a/tensorflow/compiler/tests/reduce_ops_test.py b/tensorflow/compiler/tests/reduce_ops_test.py index 132c59c32c9..e8fc81bbb54 100644 --- a/tensorflow/compiler/tests/reduce_ops_test.py +++ b/tensorflow/compiler/tests/reduce_ops_test.py @@ -91,6 +91,7 @@ class ReduceOpsTest(xla_test.XLATestCase, parameterized.TestCase): np.array([], dtype=np.bool).reshape(0, 3), np.array([[False, True, False], [True, True, False]]), ] + ONES = [np.ones([34000, 2])] def testReduceSumF32(self, index_dtype): self._testReduction(math_ops.reduce_sum, np.sum, np.float32, self.REAL_DATA, @@ -149,6 +150,11 @@ class ReduceOpsTest(xla_test.XLATestCase, parameterized.TestCase): self._testReduction(math_ops.reduce_mean, np.mean, np.float32, self.NONEMPTY_REAL_DATA, index_dtype) + def testReduceMeanF16(self, index_dtype): + if np.float16 in self.all_types: + self._testReduction(math_ops.reduce_mean, np.mean, np.float16, self.ONES, + index_dtype) + def testReduceMeanC64(self, index_dtype): self._testReduction(math_ops.reduce_mean, np.mean, np.complex64, self.NONEMPTY_COMPLEX_DATA, index_dtype) diff --git a/tensorflow/compiler/tests/rmsprop_test.py b/tensorflow/compiler/tests/rmsprop_test.py index 8840a1329a9..dc3e90b4afa 100644 --- a/tensorflow/compiler/tests/rmsprop_test.py +++ b/tensorflow/compiler/tests/rmsprop_test.py @@ -76,7 +76,7 @@ class RmspropTest(xla_test.XLATestCase): rms_opt = rmsprop.RMSPropOptimizer(learning_rate, centered=centered) rms_update = rms_opt.apply_gradients( zip([grads0, grads1], [var0, var1])) - variables.global_variables_initializer().run() + self.evaluate(variables.global_variables_initializer()) mg0 = rms_opt.get_slot(var0, "mg") self.assertEqual(mg0 is not None, centered) @@ -92,12 +92,12 @@ class RmspropTest(xla_test.XLATestCase): self.assertTrue(mom1 is not None) # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 3 steps of RMSProp for _ in range(3): - rms_update.run() + self.evaluate(rms_update) var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy( var0_np, @@ -118,14 +118,14 @@ class RmspropTest(xla_test.XLATestCase): # Validate updated params if centered: - self.assertAllCloseAccordingToType(mg0_np, mg0.eval()) - self.assertAllCloseAccordingToType(mg1_np, mg1.eval()) - self.assertAllCloseAccordingToType(rms0_np, rms0.eval()) - self.assertAllCloseAccordingToType(rms1_np, rms1.eval()) - self.assertAllCloseAccordingToType(mom0_np, mom0.eval()) - self.assertAllCloseAccordingToType(mom1_np, mom1.eval()) - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0)) + self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1)) + self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0)) + self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1)) + self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0)) + self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1)) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) +
self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py index 897db384b7e..17639bd8a75 100644 --- a/tensorflow/compiler/tests/scan_ops_test.py +++ b/tensorflow/compiler/tests/scan_ops_test.py @@ -71,7 +71,7 @@ def handle_options(func, x, axis, exclusive, reverse): class CumsumTest(xla_test.XLATestCase): - valid_dtypes = [np.float32] + valid_dtypes = [np.float32, np.int32] def axis_dtypes(self): return set(self.int_types).intersection([np.int32, np.int64]) @@ -149,7 +149,7 @@ class CumsumTest(xla_test.XLATestCase): class CumprodTest(xla_test.XLATestCase): - valid_dtypes = [np.float32] + valid_dtypes = [np.float32, np.int32] def axis_dtypes(self): return set(self.int_types).intersection([np.int32, np.int64]) diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py index 21708aa1587..ee7ca7e6f19 100644 --- a/tensorflow/compiler/tests/stateless_random_ops_test.py +++ b/tensorflow/compiler/tests/stateless_random_ops_test.py @@ -156,7 +156,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase): return math.exp(-(x**2) / 2.) / math.sqrt(2 * math.pi) def probit(x, sess=sess): - return sess.run(special_math.ndtri(x)) + return self.evaluate(special_math.ndtri(x)) a = -2. b = 2. diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py index 46ca371c8ab..d7e26d79c4c 100644 --- a/tensorflow/compiler/tests/tensor_array_ops_test.py +++ b/tensorflow/compiler/tests/tensor_array_ops_test.py @@ -79,7 +79,8 @@ class TensorArrayTest(xla_test.XLATestCase): c0 = w2.stack() self.assertAllEqual( - convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]), c0.eval()) + convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]), + self.evaluate(c0)) def testTensorArrayWritePack(self): for dtype in self.numeric_tf_types: @@ -97,7 +98,7 @@ class TensorArrayTest(xla_test.XLATestCase): c0 = w2.stack() - self.assertAllEqual([3, 0, 1], c0.eval().shape) + self.assertAllEqual([3, 0, 1], self.evaluate(c0).shape) def _testTensorArrayWriteConcat(self, tf_dtype): with self.cached_session(), self.test_scope(): @@ -113,8 +114,8 @@ class TensorArrayTest(xla_test.XLATestCase): c0 = w2.concat() self.assertAllEqual( - convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0], - [106.0, 107.0], [8.0, 9.0], [204.0, 205.0]]), c0.eval()) + convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0], [106.0, 107.0], + [8.0, 9.0], [204.0, 205.0]]), self.evaluate(c0)) def testTensorArrayWriteConcat(self): for dtype in self.numeric_tf_types: @@ -341,7 +342,7 @@ class TensorArrayTest(xla_test.XLATestCase): r0_bad = gen_data_flow_ops.tensor_array_read_v3( handle=w0.handle, index=0, dtype=dtype2, flow_in=w0.flow) with self.assertRaisesOpError("TensorArray dtype is "): - r0_bad.eval() + self.evaluate(r0_bad) # Test reading from a different index than the one we wrote to w0.read(1) @@ -422,7 +423,7 @@ class TensorArrayTest(xla_test.XLATestCase): w2 = h2.write(0, 5.0) r2 = w2.read(0) r = r1 + r2 - self.assertAllClose(9.0, r.eval()) + self.assertAllClose(9.0, self.evaluate(r)) def _testTensorArrayGradientWriteReadType(self, dtype): with self.cached_session() as session, self.test_scope(): @@ -504,7 +505,7 @@ class TensorArrayTest(xla_test.XLATestCase): [-0.5, 1.5], # read(0) gradient [20.0, 30.0, 40.0, 50.0], # concat gradient ]) - grad_vals = sess.run(grad_r) # 2 + 2 entries + grad_vals = self.evaluate(grad_r) # 2 + 2 
entries self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0]) self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1]) @@ -526,7 +527,7 @@ class TensorArrayTest(xla_test.XLATestCase): with ops.control_dependencies([r0_readtwice]): r1_readtwice = w_readtwice.read(0) - self.assertAllEqual([1.0, -1.0], r1_readtwice.eval()) + self.assertAllEqual([1.0, -1.0], self.evaluate(r1_readtwice)) def _testTensorArrayGradientUnpackRead(self): with self.cached_session() as session, self.test_scope(): @@ -592,7 +593,7 @@ class TensorArrayTest(xla_test.XLATestCase): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", size=3) s = ta.size() - self.assertAllEqual(3, s.eval()) + self.assertAllEqual(3, self.evaluate(s)) def testWriteCloseTensorArray(self): with self.cached_session(), self.test_scope(): @@ -722,7 +723,7 @@ class TensorArrayTest(xla_test.XLATestCase): # r = acc2.stack() # grad = gradients_impl.gradients(r, [x])[0] - # self.assertAllClose(31.0, grad.eval()) + # self.assertAllClose(31.0, self.evaluate(grad)) def testSumOfTwoReadVariablesWithoutRepeatGrad(self): with self.cached_session() as session, self.test_scope(): @@ -912,7 +913,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertEqual(0, ta.size().eval()) ta = ta.unstack(array_ops.zeros([0, 3, 5])) packed = ta.stack() - self.assertAllEqual([0, 3, 5], packed.eval().shape) + self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape) # Concatenating zero tensors along their first dimension gives a # first dimension of zero self.assertAllEqual([0, 5], ta.concat().eval().shape) @@ -1041,8 +1042,8 @@ class TensorArrayTest(xla_test.XLATestCase): (read0, read1, size0, size1)) # Tests that the control dependencies was added and executed. - self.assertEqual(1, v0.eval()) - self.assertEqual(1, v1.eval()) + self.assertEqual(1, self.evaluate(v0)) + self.assertEqual(1, self.evaluate(v1)) # Tests correct TensorArray. self.assertEqual(read0_v, 0) diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index d612d3b32dd..95c9e7ffd46 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -481,6 +481,72 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([-1, -0.5, 0, 0.3], dtype=dtype), expected=np.array([-1., -0.5, 0., 0.296875], dtype=dtype)) + def quantize_and_dequantize_v2_round_half_up(x): + return array_ops.quantize_and_dequantize_v2( + x, + -1, + 1.0, + signed_input=True, + num_bits=8, + range_given=True, + round_mode="HALF_UP") + + self._assertOpOutputMatchesExpected( + quantize_and_dequantize_v2_round_half_up, + np.array([-0.8, -0.5, 0, 0.3, 0.8, -2, 33], dtype=dtype), + expected=np.array([ + -102.0 / 127, + -63.0 / 127, + 0, + 38.0 / 127, + 102.0 / 127, + -128.0 / 127, + 1, + ], + dtype=dtype)) + + def quantize_and_dequantize_v2_round_half_to_even(x): + return array_ops.quantize_and_dequantize_v2( + x, + -1.0, + 1.0, + signed_input=True, + num_bits=8, + range_given=True, + round_mode="HALF_TO_EVEN") + + self._assertOpOutputMatchesExpected( + quantize_and_dequantize_v2_round_half_to_even, + np.array( + [ + -0.8, + # The -0.5 should become -63.5 after scaling and with + # rounding this should become -64. But with the test + # unary_ops_test_cpu_ondemand, this fails as the result + # before scaling becomes -63.499996 and gets rounded to -63. + # TODO(sreenik): Some one more familiar with this test needs + # to take a look and resolve this. 
This works on all other + # variations of the platform like cpu, and gpu. + # -0.5, + 0, + 0.3, + 0.8, + -2, + 33 + ], + dtype=dtype), + expected=np.array( + [ + -102.0 / 127, + # -64.0 / 127, + 0, + 38.0 / 127, + 102.0 / 127, + -128.0 / 127, + 1, + ], + dtype=dtype)) + def quantize_and_dequantize_v3(x): return array_ops.quantize_and_dequantize_v3( x, -127, 127, num_bits=8, signed_input=True, range_given=False) diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py index 77cdeac8168..fcd7ac5ba1c 100644 --- a/tensorflow/compiler/tests/variable_ops_test.py +++ b/tensorflow/compiler/tests/variable_ops_test.py @@ -77,7 +77,7 @@ class VariableOpsTest(xla_test.XLATestCase): sess.run(variables.variables_initializer([v])) x = v.sparse_read(2) self.assertAllClose( - np.array([8j, 9, 10, 11]).astype(dtype), sess.run(x)) + np.array([8j, 9, 10, 11]).astype(dtype), self.evaluate(x)) def testSparseRead1DIndices(self): for dtype in self.numeric_types: @@ -89,7 +89,7 @@ class VariableOpsTest(xla_test.XLATestCase): x = v.sparse_read([2, 1]) self.assertAllClose( np.array([[8, 9, 10, 11], [4, 5, 6j, 7]]).astype(dtype), - sess.run(x)) + self.evaluate(x)) def testSparseRead2DIndices(self): for dtype in self.numeric_types: @@ -102,7 +102,7 @@ class VariableOpsTest(xla_test.XLATestCase): self.assertAllClose( np.array([[[8, 9, 10, 11], [4, 5, 6, 7]], [[0, 1, 2j, 3], [8, 9, 10, 11]]]).astype(dtype), - sess.run(x)) + self.evaluate(x)) def testSparseRead2DIndices3DTensor(self): for dtype in self.numeric_types: @@ -115,9 +115,9 @@ class VariableOpsTest(xla_test.XLATestCase): x = v.sparse_read([[2, 1], [3, 0]]) self.assertAllClose( np.array( - [[[[20, 21, 22], [23, 24j, 25]], [[10, 11, 12], [13, 14, 15]] - ], [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]] - ],).astype(dtype), sess.run(x)) + [[[[20, 21, 22], [23, 24j, 25]], [[10, 11, 12], [13, 14, 15]]], + [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]] + ],).astype(dtype), self.evaluate(x)) def testShape(self): for dtype in self.numeric_types: @@ -229,7 +229,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_add( handle, [0], constant_op.constant([[2]], dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertAllEqual(sess.run(read), [[3], [7]]) + self.assertAllEqual(self.evaluate(read), [[3], [7]]) def testScatterSub(self): with self.test_session() as sess, self.test_scope(): @@ -242,7 +242,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_sub( handle, [1], constant_op.constant([[2]], dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertAllEqual(sess.run(read), [[4], [-1]]) + self.assertAllEqual(self.evaluate(read), [[4], [-1]]) def testScatterMul(self): with self.test_session() as sess, self.test_scope(): @@ -255,7 +255,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_mul( handle, [0], constant_op.constant([[5]], dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertEqual(sess.run(read), [[5]]) + self.assertEqual(self.evaluate(read), [[5]]) def testScatterDiv(self): with self.test_session() as sess, self.test_scope(): @@ -268,7 +268,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_div( handle, [0], constant_op.constant([[3]], dtype=dtypes.int32))) read = 
resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertAllEqual(sess.run(read), [[2]]) + self.assertAllEqual(self.evaluate(read), [[2]]) def testScatterMin(self): with self.test_session() as sess, self.test_scope(): @@ -281,7 +281,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_min( handle, [0], constant_op.constant([[3]], dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertEqual(sess.run(read), [[3]]) + self.assertEqual(self.evaluate(read), [[3]]) def testScatterMax(self): with self.test_session() as sess, self.test_scope(): @@ -294,7 +294,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_max( handle, [0], constant_op.constant([[3]], dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertEqual(sess.run(read), [[6]]) + self.assertEqual(self.evaluate(read), [[6]]) def testScatterUpdate(self): with self.test_session() as sess, self.test_scope(): @@ -307,7 +307,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_update( handle, [0], constant_op.constant([[3]], dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertEqual(sess.run(read), [[3]]) + self.assertEqual(self.evaluate(read), [[3]]) def testScatterAddScalar(self): with self.test_session() as sess, self.test_scope(): @@ -320,7 +320,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_add( handle, [0], constant_op.constant(2, dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertEqual(sess.run(read), [[3]]) + self.assertEqual(self.evaluate(read), [[3]]) def testScatterSubScalar(self): with self.test_session() as sess, self.test_scope(): @@ -333,7 +333,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_sub( handle, [0], constant_op.constant(2, dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertEqual(sess.run(read), [[-1]]) + self.assertEqual(self.evaluate(read), [[-1]]) def testScatterMulScalar(self): with self.test_session() as sess, self.test_scope(): @@ -346,7 +346,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_mul( handle, [0], constant_op.constant(5, dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertEqual(sess.run(read), [[5]]) + self.assertEqual(self.evaluate(read), [[5]]) def testScatterDivScalar(self): with self.test_session() as sess, self.test_scope(): @@ -359,7 +359,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_div( handle, [0], constant_op.constant(3, dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertEqual(sess.run(read), [[2]]) + self.assertEqual(self.evaluate(read), [[2]]) def testScatterMinScalar(self): with self.test_session() as sess, self.test_scope(): @@ -372,7 +372,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_min( handle, [0], constant_op.constant(3, dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertEqual(sess.run(read), [[3]]) + self.assertEqual(self.evaluate(read), [[3]]) def testScatterMaxScalar(self): with self.test_session() as 
sess, self.test_scope(): @@ -385,7 +385,7 @@ class VariableOpsTest(xla_test.XLATestCase): resource_variable_ops.resource_scatter_max( handle, [0], constant_op.constant(3, dtype=dtypes.int32))) read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) - self.assertEqual(sess.run(read), [[6]]) + self.assertEqual(self.evaluate(read), [[6]]) def testScatterNdAddOps(self): with self.test_session() as sess, self.test_scope(): @@ -400,7 +400,7 @@ class VariableOpsTest(xla_test.XLATestCase): sess.run(gen_state_ops.resource_scatter_nd_add(handle, indices, updates)) read = resource_variable_ops.read_variable_op( handle, dtype=dtypes.float32) - self.assertAllClose(expected, sess.run(read)) + self.assertAllClose(expected, self.evaluate(read)) def testScatterNdUpdateAddOps(self): with self.test_session() as sess, self.test_scope(): @@ -416,7 +416,7 @@ class VariableOpsTest(xla_test.XLATestCase): gen_state_ops.resource_scatter_nd_update(handle, indices, updates)) read = resource_variable_ops.read_variable_op( handle, dtype=dtypes.float32) - self.assertAllClose(expected, sess.run(read)) + self.assertAllClose(expected, self.evaluate(read)) class StridedSliceAssignChecker(object): diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py index 28d61fb07dc..ef55292b1be 100644 --- a/tensorflow/compiler/tests/xla_device_test.py +++ b/tensorflow/compiler/tests/xla_device_test.py @@ -81,7 +81,7 @@ class XlaDeviceTest(xla_test.XLATestCase): with self.cached_session() as sess: with self.test_scope(): x = gen_control_flow_ops.control_trigger() - sess.run(x) + self.evaluate(x) if __name__ == "__main__": diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index e0171415492..5a0d9b9af9d 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -9,6 +9,7 @@ package_group( "//tensorflow/compiler/jit/...", "//tensorflow/compiler/tests/...", "//tensorflow/compiler/tf2xla/...", + "//tensorflow/contrib/compiler/...", ], ) @@ -195,8 +196,8 @@ cc_library( ":sharding_util", ":side_effect_util", ":tf2xla_util", + "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:xla_cluster_util", - "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags", "//tensorflow/compiler/tf2xla/lib:util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -204,13 +205,13 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/client/lib:constants", - "//tensorflow/compiler/xla/client/lib:numeric", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -221,6 +222,7 @@ cc_library( "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], alwayslink = 1, @@ -437,21 +439,15 @@ cc_library( name = "dump_graph", srcs = [ "dump_graph.cc", - "dump_graph_flags.cc", - "dump_graph_flags.h", ], hdrs = [ "dump_graph.h", ], deps = [ - "//tensorflow/compiler/xla:parse_flags_from_env", - "//tensorflow/core:core_cpu", - 
"//tensorflow/core:core_cpu_internal", + "//tensorflow/compiler/jit:flags", "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", + "//tensorflow/core:graph", "//tensorflow/core:protos_all_cc", - "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc index 380c6a7e23d..64fdbbebc65 100644 --- a/tensorflow/compiler/tf2xla/dump_graph.cc +++ b/tensorflow/compiler/tf2xla/dump_graph.cc @@ -18,87 +18,26 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/dump_graph.h" -#include "absl/strings/str_cat.h" -#include "tensorflow/compiler/tf2xla/dump_graph_flags.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/core/util/dump_graph.h" namespace tensorflow { namespace dump_graph { -namespace { - -struct NameCounts { - mutex counts_mutex; - std::unordered_map counts; -}; - -string MakeUniqueFilename(string name) { - static NameCounts& instance = *new NameCounts; - - // Remove illegal characters from `name`. - for (int i = 0; i < name.size(); ++i) { - char ch = name[i]; - if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?') { - name[i] = '_'; - } - } - - int count; - { - mutex_lock lock(instance.counts_mutex); - count = instance.counts[name]++; - } - - string filename = name; - if (count > 0) { - absl::StrAppend(&filename, "_", count); - } - absl::StrAppend(&filename, ".pbtxt"); - return filename; -} - -string WriteTextProtoToUniqueFile( - Env* env, const string& name, const char* proto_type, - const ::tensorflow::protobuf::Message& proto) { - const string& dirname = - legacy_flags::GetDumpGraphFlags()->tf_dump_graph_prefix; - Status status = env->RecursivelyCreateDir(dirname); - if (!status.ok()) { - LOG(WARNING) << "Failed to create " << dirname << " for dumping " - << proto_type << ": " << status; - return "(unavailable)"; - } - string filepath = absl::StrCat(dirname, "/", MakeUniqueFilename(name)); - status = WriteTextProto(Env::Default(), filepath, proto); - if (!status.ok()) { - LOG(WARNING) << "Failed to dump " << proto_type << " to file: " << filepath - << " : " << status; - return "(unavailable)"; - } - LOG(INFO) << "Dumped " << proto_type << " to " << filepath; - return filepath; -} - -} // anonymous namespace - string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) { - return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef", - graph_def); + return tensorflow::DumpGraphDefToFile( + name, graph_def, GetDumpGraphFlags()->tf_dump_graph_prefix); } string DumpGraphToFile(const string& name, Graph const& graph, const FunctionLibraryDefinition* flib_def) { - GraphDef graph_def; - graph.ToGraphDef(&graph_def); - if (flib_def) { - *graph_def.mutable_library() = flib_def->ToProto(); - } - return DumpGraphDefToFile(name, graph_def); + return tensorflow::DumpGraphToFile(name, graph, flib_def, + GetDumpGraphFlags()->tf_dump_graph_prefix); } string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) { - return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef); + return tensorflow::DumpFunctionDefToFile( + name, fdef, GetDumpGraphFlags()->tf_dump_graph_prefix); } } // namespace dump_graph diff --git a/tensorflow/compiler/tf2xla/dump_graph_flags.cc b/tensorflow/compiler/tf2xla/dump_graph_flags.cc deleted file mode 100644 index 2eb1f8cd849..00000000000 --- 
a/tensorflow/compiler/tf2xla/dump_graph_flags.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Legacy flags for the XLA bridge's dump_graph module. - -#include -#include - -#include "tensorflow/compiler/tf2xla/dump_graph_flags.h" -#include "tensorflow/compiler/xla/parse_flags_from_env.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace tensorflow { -namespace legacy_flags { - -// Pointers to the parsed value of the flags and flag descriptors, initialized -// via flags_init. -static DumpGraphFlags* flags; -static std::vector* flag_list; -static std::once_flag flags_init; - -// Allocate *flags. Called via call_once(&flags_init,...). -static void AllocateFlags() { - flags = new DumpGraphFlags; - flags->tf_dump_graph_prefix = "/tmp/"; - flag_list = new std::vector({ - Flag("tf_dump_graph_prefix", &flags->tf_dump_graph_prefix, - "Path prefix to which graphs dumped during debugging should be " - "written."), - }); - xla::ParseFlagsFromEnv(*flag_list); -} - -// Append to *append_to flag definitions associated with the XLA bridge's -// dump_graph module. -void AppendDumpGraphFlags(std::vector* append_to) { - std::call_once(flags_init, &AllocateFlags); - append_to->insert(append_to->end(), flag_list->begin(), flag_list->end()); -} - -// Return a pointer to the DumpGraphFlags struct; -// repeated calls return the same pointer. -// This should be called only after Flags::Parse() has returned. -DumpGraphFlags* GetDumpGraphFlags() { - std::call_once(flags_init, &AllocateFlags); - return flags; -} - -} // namespace legacy_flags -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/dump_graph_flags.h b/tensorflow/compiler/tf2xla/dump_graph_flags.h deleted file mode 100644 index 80a3307d920..00000000000 --- a/tensorflow/compiler/tf2xla/dump_graph_flags.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_ -#define TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_ - -// Legacy flags for the XLA bridge's dump_graph module. 
- -#include - -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace tensorflow { -namespace legacy_flags { - -// Append to *flag_list flag definitions associated with the XLA bridge's -// dump_graph module. -void AppendDumpGraphFlags(std::vector* flag_list); - -// The values of flags associated with the XLA bridge's -// dump_graph module. -typedef struct { - string tf_dump_graph_prefix; // Path prefix to which graphs dumped during - // debugging should be written. -} DumpGraphFlags; - -// Return a pointer to the DumpGraphFlags struct; -// repeated calls return the same pointer. -// This should be called only after Flags::Parse() has returned. -DumpGraphFlags* GetDumpGraphFlags(); - -} // namespace legacy_flags -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_ diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 9ef9f49f422..3dfd3f854c8 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -75,6 +75,25 @@ Status FunctionalizeControlFlow(Graph* graph, return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library); } +Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def, + FunctionLibraryDefinition* library) { + return FunctionalizeControlFlowForGraphDef(/*lookup_library=*/nullptr, + graph_def, library); +} + +Status FunctionalizeControlFlowForGraphDef( + const FunctionLibraryDefinition* lookup_library, GraphDef* graph_def, + FunctionLibraryDefinition* library) { + FunctionDefLibrary function_lib = graph_def->library(); + Graph graph(OpRegistry::Global()); + + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph({}, *graph_def, &graph)); + TF_RETURN_IF_ERROR(FunctionalizeControlFlow(lookup_library, &graph, library)); + graph.ToGraphDef(graph_def); + std::swap(*graph_def->mutable_library(), function_lib); + return Status::OK(); +} + Status FunctionalizeControlFlowForFunction( const string& func_name, const string& new_func_name, const protobuf::Map& attrs, diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h index ba99205640c..91d33fa4058 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h @@ -33,6 +33,12 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library, Graph* graph, FunctionLibraryDefinition* library); +Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def, + FunctionLibraryDefinition* library); +Status FunctionalizeControlFlowForGraphDef( + const FunctionLibraryDefinition* lookup_library, GraphDef* graph_def, + FunctionLibraryDefinition* library); + // This pass looks at the graph and all associated FunctionDefs, and turns // traditional control flow structure (Switch/Merge/etc.) into functional // control flow structure (If/While). 
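The new `FunctionalizeControlFlowForGraphDef` overloads declared above take a `GraphDef` rather than a `Graph`: they convert the `GraphDef` to a `Graph` internally, run the same Switch/Merge-to-If/While rewrite, write the result back, and restore the original function library, while the cond/body functions created for the new `If`/`While` nodes are added to the supplied `FunctionLibraryDefinition`. A minimal caller might look like the sketch below; the wrapper name and the core-framework include paths are assumptions for illustration, not part of this change.

```
// Sketch only: functionalize control flow directly on a GraphDef, in place.
#include "tensorflow/compiler/tf2xla/functionalize_control_flow.h"
#include "tensorflow/core/framework/function.h"   // FunctionLibraryDefinition (assumed path)
#include "tensorflow/core/framework/graph.pb.h"   // GraphDef (assumed path)
#include "tensorflow/core/framework/op.h"         // OpRegistry (assumed path)

namespace tensorflow {

// Hypothetical helper, not part of this change.
Status FunctionalizeGraphDefInPlace(GraphDef* graph_def) {
  // Cond/body functions generated for the new If/While nodes are added to
  // `library`; graph_def itself keeps its original FunctionDefLibrary.
  FunctionLibraryDefinition library(OpRegistry::Global(), graph_def->library());
  return FunctionalizeControlFlowForGraphDef(graph_def, &library);
}

}  // namespace tensorflow
```

The updated tests below call these overloads alongside the existing in-place `Graph` conversion and check that both produce the same functionalized result.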
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index c3841f996f8..9784985af83 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -95,77 +95,87 @@ TEST(FunctionalizeControlFlow, Conditional) { } FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + TF_ASSERT_OK( + FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); - GraphDef graph_def; - graph.ToGraphDef(&graph_def); - string op_name; - NameAttrList then_fn; - NameAttrList else_fn; - TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn)); - InstantiationResultForTest else_result; - TF_EXPECT_OK( - InstantiateFunctionForTest(else_fn.name(), library, &else_result)); + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + string op_name; + NameAttrList then_fn; + NameAttrList else_fn; + TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn)); + InstantiationResultForTest else_result; + TF_EXPECT_OK( + InstantiateFunctionForTest(else_fn.name(), library, &else_result)); - // Outer graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32); - auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); - auto less = ops::Less(scope.WithOpName("cond/Less"), y, x); - auto if_op = ops::If(scope.WithOpName(op_name), less, - std::initializer_list{less, y, x}, {DT_INT32}, - then_fn, else_fn); - auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32); + auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); + auto less = ops::Less(scope.WithOpName("cond/Less"), y, x); + auto if_op = ops::If(scope.WithOpName(op_name), less, + std::initializer_list{less, y, x}, {DT_INT32}, + then_fn, else_fn); + auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - // then body. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0); - auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0); - auto cond = ops::Const( - scope.WithOpName("cond").WithControlDependencies(identity), 17); - auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond); - auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), mul, 0); + // then body. 
+ { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0); + auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0); + auto cond = ops::Const( + scope.WithOpName("cond").WithControlDependencies(identity), 17); + auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond); + auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), mul, 0); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(then_fn.name(), library, &result)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(then_fn.name(), library, &result)); - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), + result.arg_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - // else body. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0); - auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0); - auto cond_1 = ops::Const( - scope.WithOpName("cond_1").WithControlDependencies(identity), 23); - auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1); - auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); + // else body. 
+ { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0); + auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0); + auto cond_1 = ops::Const( + scope.WithOpName("cond_1").WithControlDependencies(identity), 23); + auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1); + auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(else_fn.name(), library, &result)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(else_fn.name(), library, &result)); - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), + result.arg_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } } @@ -239,75 +249,77 @@ TEST(FunctionalizeControlFlow, OneLoopVar) { } FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + TF_ASSERT_OK( + FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); - GraphDef graph_def; - graph.ToGraphDef(&graph_def); + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + NameAttrList cond_fn, body_fn; + TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); - NameAttrList cond_fn, body_fn; - TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + auto while_op = + ops::While(scope.WithOpName("while/LoopCond"), + std::initializer_list{source}, cond_fn, body_fn); + auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - // Outer graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); - auto while_op = - ops::While(scope.WithOpName("while/LoopCond"), - std::initializer_list{source}, cond_fn, body_fn); - auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } + // Condition graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto ten = ops::Const( + scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10); + auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); - // Condition graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto ten = ops::Const( - 
scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10); - auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(cond_fn.name(), library, &result)); - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result)); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); - EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } + // Body graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); + auto one = ops::Const( + scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); + auto add = ops::Add(scope.WithOpName("while/add"), identity, one); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); - // Body graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); - auto one = ops::Const( - scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); - auto add = ops::Add(scope.WithOpName("while/add"), identity, one); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(body_fn.name(), library, &result)); - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } } -// @function.Defun(noinline=True) -// def increment_fn(x): -// return [x + 1] -// Define the above function, and add it to the given graph. It's used as the -// while loop body in NoinlineLoopBody test. -Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) { +FunctionDef GetNoinlineFunctionDef() { FunctionDef fdef = FunctionDefHelper::Create( "increment_fn", {"x:int32"}, {"add:int32"}, {}, { @@ -316,8 +328,17 @@ Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) { }, {{"add", "add_0:z:0"}}); (*fdef.mutable_attr())["_noinline"].set_b(true); + return fdef; +} + +// @function.Defun(noinline=True) +// def increment_fn(x): +// return [x + 1] +// Define the above function, and add it to the given graph. It's used as the +// while loop body in NoinlineLoopBody test. 
+Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) { FunctionDefLibrary fdef_lib; - *(fdef_lib.add_function()) = fdef; + *(fdef_lib.add_function()) = GetNoinlineFunctionDef(); TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdef_lib)); NodeDef increment_fn; increment_fn.set_name(node_name); @@ -376,55 +397,88 @@ TEST(FunctionalizeControlFlow, NoinlineLoopBody) { FunctionLibraryDefinition lookup_lib(graph.flib_def()); FunctionLibraryDefinition library(OpRegistry::Global(), {}); // Function increment_fn will be copied from lookup_lib to library. + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + + *(optimized_graph_def.mutable_library()->add_function()) = + GetNoinlineFunctionDef(); + + TF_ASSERT_OK(FunctionalizeControlFlowForGraphDef( + &lookup_lib, &optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); - GraphDef graph_def; - graph.ToGraphDef(&graph_def); + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + NameAttrList cond_fn, body_fn; + TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); - NameAttrList cond_fn, body_fn; - TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + auto while_op = + ops::While(scope.WithOpName("while/LoopCond"), + std::initializer_list{source}, cond_fn, body_fn); + GraphDef expected; + TF_ASSERT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - // Outer graph + // Body graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + TF_ASSERT_OK( + AddNoinlineFunctionToGraph(noinline_node_name, scope.graph())); + auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); + NodeDef retval; + retval.set_name("_retval0_RetVal"); + retval.set_op(FunctionLibraryDefinition::kRetOp); + *retval.add_input() = noinline_node_name; + (*retval.mutable_attr())["T"].set_type(DT_INT32); + (*retval.mutable_attr())["index"].set_i(0); + Status status; + scope.graph()->AddNode(retval, &status); + TF_ASSERT_OK(status); + + GraphDef expected; + TF_ASSERT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + // Verify that increment_fn has been copied to library. + TF_EXPECT_OK( + InstantiateFunctionForTest(body_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + // Ignore the function library when comparing the graphs. + expected.clear_library(); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } + } +} + +TEST(FunctionalizeControlFlow, MissingFunctionDefInLibrary) { + const string& noinline_node_name = "while/increment_fn"; + Graph graph(OpRegistry::Global()); { Scope scope = Scope::NewRootScope().ExitOnError(); auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); - auto while_op = - ops::While(scope.WithOpName("while/LoopCond"), - std::initializer_list{source}, cond_fn, body_fn); - GraphDef expected; - TF_ASSERT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } - - // Body graph. 
- { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto identity = ops::Identity(scope.WithOpName("while/Identity"), source); TF_ASSERT_OK(AddNoinlineFunctionToGraph(noinline_node_name, scope.graph())); - auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); - NodeDef retval; - retval.set_name("_retval0_RetVal"); - retval.set_op(FunctionLibraryDefinition::kRetOp); - *retval.add_input() = noinline_node_name; - (*retval.mutable_attr())["T"].set_type(DT_INT32); - (*retval.mutable_attr())["index"].set_i(0); - Status status; - scope.graph()->AddNode(retval, &status); - TF_ASSERT_OK(status); - - GraphDef expected; - TF_ASSERT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - // Verify that increment_fn has been copied to library. - TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - // Ignore the function library when comparing the graphs. - expected.clear_library(); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + TF_ASSERT_OK(scope.ToGraph(&graph)); } + + FunctionLibraryDefinition lookup_lib(graph.flib_def()); + FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef graph_def; + graph.ToGraphDef(&graph_def); + graph_def.clear_library(); + + Status status = + FunctionalizeControlFlowForGraphDef(&lookup_lib, &graph_def, &library); + EXPECT_EQ(tensorflow::error::NOT_FOUND, status.code()); } // Tests functionalizing OneLoopVar where the loop value is not used post the @@ -467,65 +521,72 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) { } FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + TF_ASSERT_OK( + FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); - GraphDef graph_def; - graph.ToGraphDef(&graph_def); + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + NameAttrList cond_fn, body_fn; + TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); - NameAttrList cond_fn, body_fn; - TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + auto while_op = + ops::While(scope.WithOpName("while/LoopCond"), + std::initializer_list{source}, cond_fn, body_fn); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - // Outer graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); - auto while_op = - ops::While(scope.WithOpName("while/LoopCond"), - std::initializer_list{source}, cond_fn, body_fn); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } + // Condition graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto ten = ops::Const( + scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10); + auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), 
less, 0); - // Condition graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto ten = ops::Const( - scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10); - auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(cond_fn.name(), library, &result)); - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result)); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); - EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } + // Body graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); + auto one = ops::Const( + scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); + auto add = ops::Add(scope.WithOpName("while/add"), identity, one); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); - // Body graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); - auto one = ops::Const( - scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); - auto add = ops::Add(scope.WithOpName("while/add"), identity, one); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(body_fn.name(), library, &result)); - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } } @@ -608,86 +669,95 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) { } FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + TF_ASSERT_OK( + FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); - GraphDef graph_def; - graph.ToGraphDef(&graph_def); + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + NameAttrList cond_fn, body_fn; + TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); - NameAttrList cond_fn, body_fn; - TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); + // Outer graph. 
+ { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32); + auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32); + auto while_op = + ops::While(scope.WithOpName("while/LoopCond"), + std::initializer_list{x, y}, cond_fn, body_fn); + auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]); + auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - // Outer graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32); - auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32); - auto while_op = - ops::While(scope.WithOpName("while/LoopCond"), - std::initializer_list{x, y}, cond_fn, body_fn); - auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]); - auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } - - // Condition graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto three = ops::Const(scope.WithOpName("while/cond/three") + // Condition graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto three = ops::Const(scope.WithOpName("while/cond/three") + .WithControlDependencies(arg0.output), + 3); + auto cond_add = + ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three); + auto ten = ops::Const(scope.WithOpName("while/cond/ten") .WithControlDependencies(arg0.output), - 3); - auto cond_add = - ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three); - auto ten = ops::Const( - scope.WithOpName("while/cond/ten").WithControlDependencies(arg0.output), - 10); - auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); + 10); + auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(cond_fn.name(), library, &result)); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types); - EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types); + EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - // Body graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + // Body graph. 
+ { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto identity_x = ops::Identity(scope.WithOpName("while/Identity/x"), arg0); - auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"), arg1); + auto identity_x = + ops::Identity(scope.WithOpName("while/Identity/x"), arg0); + auto identity_y = + ops::Identity(scope.WithOpName("while/Identity/y"), arg1); - auto one = ops::Const( - scope.WithOpName("while/add/one").WithControlDependencies(identity_x), - 1); - auto two = ops::Const( - scope.WithOpName("while/mul/two").WithControlDependencies(identity_x), - 2); + auto one = ops::Const( + scope.WithOpName("while/add/one").WithControlDependencies(identity_x), + 1); + auto two = ops::Const( + scope.WithOpName("while/mul/two").WithControlDependencies(identity_x), + 2); - auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one); - auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two); - auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); - auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1); + auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one); + auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two); + auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); + auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(body_fn.name(), library, &result)); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types); + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } } @@ -841,177 +911,192 @@ TEST(FunctionalizeControlFlow, Complex) { } FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + TF_ASSERT_OK( + FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); - GraphDef graph_def; - graph.ToGraphDef(&graph_def); - - NameAttrList outer_cond_fn, outer_body_fn; - TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn)); - - // Outer graph. 
- { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); - auto three = ops::Const(scope.WithOpName("three"), 3); - auto y = ops::Add(scope.WithOpName("y"), x, three); - - auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32, - TensorShape({})); - - auto zero = ops::Const(scope.WithOpName("outer/Const"), 0); - - auto while_op = ops::While(scope.WithOpName("outer/LoopCond"), - std::initializer_list{zero, y, x, var}, - outer_cond_fn, outer_body_fn); - auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } - - // Outer condition graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); - - auto ten = ops::Const( - scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output), - 10); - auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + NameAttrList outer_cond_fn, outer_body_fn; TF_EXPECT_OK( - InstantiateFunctionForTest(outer_cond_fn.name(), library, &result)); + FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn)); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), - result.arg_types); - EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } + // Outer graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); + auto three = ops::Const(scope.WithOpName("three"), 3); + auto y = ops::Add(scope.WithOpName("y"), x, three); - // Outer body graph. - NameAttrList inner_cond_fn, inner_body_fn; - { - InstantiationResultForTest result; - TF_EXPECT_OK( - InstantiateFunctionForTest(outer_body_fn.name(), library, &result)); + auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32, + TensorShape({})); - // Find the inner condition and body names. 
- TF_EXPECT_OK( - FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn)); + auto zero = ops::Const(scope.WithOpName("outer/Const"), 0); - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); + auto while_op = ops::While(scope.WithOpName("outer/LoopCond"), + std::initializer_list{zero, y, x, var}, + outer_cond_fn, outer_body_fn); + auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0); - auto one_j = ops::Const( - scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1); - auto while_op = - ops::While(scope.WithOpName("outer/LoopCond_1"), - std::initializer_list{one_j, arg1, arg2, arg3}, - inner_cond_fn, inner_body_fn); + // Outer condition graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); - auto one_outer = ops::Const( - scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1); - auto add_i = - ops::Add(scope.WithOpName("outer/add") - .WithControlDependencies(absl::Span{ - while_op[0].op(), while_op[1].op()}), - identity_i, one_outer); + auto ten = ops::Const( + scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output), + 10); + auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); - auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0); - auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1); - auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(outer_cond_fn.name(), library, &result)); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), - result.arg_types); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), + result.arg_types); + EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - // Inner condition graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); + // Outer body graph. 
+ NameAttrList inner_cond_fn, inner_body_fn; + { + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(outer_body_fn.name(), library, &result)); - auto five = ops::Const( - scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0), 5); - auto less_j = ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0); + // Find the inner condition and body names. + TF_EXPECT_OK( + FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn)); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); - InstantiationResultForTest result; - TF_EXPECT_OK( - InstantiateFunctionForTest(inner_cond_fn.name(), library, &result)); + auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0); + auto one_j = ops::Const( + scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1); + auto while_op = + ops::While(scope.WithOpName("outer/LoopCond_1"), + std::initializer_list{one_j, arg1, arg2, arg3}, + inner_cond_fn, inner_body_fn); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), - result.arg_types); - EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } + auto one_outer = ops::Const( + scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), + 1); + auto add_i = + ops::Add(scope.WithOpName("outer/add") + .WithControlDependencies(absl::Span{ + while_op[0].op(), while_op[1].op()}), + identity_i, one_outer); - // Inner body graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); + auto retval0 = + ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0); + auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1); + auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2); - auto identity_j = - ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0); - auto identity_k = - ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - auto mul_jk = - ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k); - auto add_jkx = ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2); - auto assign = ops::AssignAddVariableOp( - scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx); + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), + result.arg_types); + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), + result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - auto one = ops::Const( - scope.WithOpName("outer/inner/One") - .WithControlDependencies( - absl::Span{assign.operation}), - 1); - auto add_j = - ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one); + // Inner condition graph. 
+ { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); - auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0); - auto retval1 = - ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1); - auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2); + auto five = ops::Const( + scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0), + 5); + auto less_j = + ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five); + auto retval = + ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - InstantiationResultForTest result; - TF_EXPECT_OK( - InstantiateFunctionForTest(inner_body_fn.name(), library, &result)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(inner_cond_fn.name(), library, &result)); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), - result.arg_types); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), + result.arg_types); + EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } + + // Inner body graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); + + auto identity_j = + ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0); + auto identity_k = + ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1); + + auto mul_jk = + ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k); + auto add_jkx = + ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2); + auto assign = ops::AssignAddVariableOp( + scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx); + + auto one = ops::Const( + scope.WithOpName("outer/inner/One") + .WithControlDependencies( + absl::Span{assign.operation}), + 1); + auto add_j = + ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one); + + auto retval0 = + ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0); + auto retval1 = + ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1); + auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(inner_body_fn.name(), library, &result)); + + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), + result.arg_types); + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), + result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } } diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index d85b4f5ae0c..fa51a72aea4 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -121,7 +121,6 @@ 
tf_kernel_library( ":while_op", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/lib:batch_dot", "//tensorflow/compiler/tf2xla/lib:broadcast", "//tensorflow/compiler/tf2xla/lib:cholesky", "//tensorflow/compiler/tf2xla/lib:qr", @@ -144,7 +143,7 @@ tf_kernel_library( "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/client/lib:constants", "//tensorflow/compiler/xla/client/lib:math", - "//tensorflow/compiler/xla/client/lib:numeric", + "//tensorflow/compiler/xla/client/lib:matrix", "//tensorflow/compiler/xla/client/lib:pooling", "//tensorflow/compiler/xla/client/lib:prng", "//tensorflow/compiler/xla/client/lib:sorting", @@ -196,7 +195,6 @@ cc_library( "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/client/lib:constants", - "//tensorflow/compiler/xla/client/lib:numeric", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -216,7 +214,6 @@ cc_library( "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/client/lib:constants", - "//tensorflow/compiler/xla/client/lib:numeric", "//tensorflow/core:framework", "//tensorflow/core/kernels:bounds_check", "//tensorflow/core/kernels:conv_ops", diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc index 2db2514397d..795ea09831e 100644 --- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc @@ -50,7 +50,7 @@ class XlaArgOp : public XlaOpKernel { return; } - const XlaExpression& arg = XlaContext::Get(ctx).args()[index_]; + const XlaExpression& arg = ctx->xla_context()->args()[index_]; OP_REQUIRES(ctx, arg.kind() != XlaExpression::Kind::kInvalid, errors::InvalidArgument("Invalid/missing argument expression")); ctx->SetOutputExpression(0, arg); diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc index 4cfe946b2e6..1b254e328a8 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" +#include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" namespace tensorflow { namespace { @@ -28,9 +30,11 @@ class BatchMatMulOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - auto result = BatchDot(ctx->Input(0), ctx->Input(1), - /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_, - /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_); + auto result = + xla::BatchDot(MaybeTransposeInMinorDims( + MaybeConjugate(ctx->Input(0), adj_x_), adj_x_), + MaybeTransposeInMinorDims( + MaybeConjugate(ctx->Input(1), adj_y_), adj_y_)); ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc index a267c0c72fc..0e2f335f335 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc @@ -115,9 +115,9 @@ class FusedBatchNormGradOp : public XlaOpKernel { // operators. For now, cast everything to the statistics type (which // may be more precise than the input type). auto grad_backprop = - XlaHelpers::ConvertElementType(b, ctx->Input(0), scale_dtype); + XlaHelpers::ConvertElementType(ctx->Input(0), scale_dtype); auto activations = - XlaHelpers::ConvertElementType(b, ctx->Input(1), scale_dtype); + XlaHelpers::ConvertElementType(ctx->Input(1), scale_dtype); auto scale = ctx->Input(2); auto mean = ctx->Input(3); auto var = ctx->Input(4); @@ -151,11 +151,11 @@ class FusedBatchNormGradOp : public XlaOpKernel { const DataType accumulation_type = XlaHelpers::SumAccumulationType(scale_dtype); auto converted = - XlaHelpers::ConvertElementType(b, grad_backprop, accumulation_type); + XlaHelpers::ConvertElementType(grad_backprop, accumulation_type); auto reduce = xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), reduction_dims); - offset_backprop = XlaHelpers::ConvertElementType(b, reduce, scale_dtype); + offset_backprop = XlaHelpers::ConvertElementType(reduce, scale_dtype); // scratch1 = rsqrt(pop_var + epsilon) auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5); @@ -165,19 +165,18 @@ class FusedBatchNormGradOp : public XlaOpKernel { // scratch2 = sum(y_backprop * (x - mean)) auto mul = xla::Mul(grad_backprop, xla::Sub(activations, mean, {feature_index})); - converted = XlaHelpers::ConvertElementType(b, mul, accumulation_type); + converted = XlaHelpers::ConvertElementType(mul, accumulation_type); reduce = xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), reduction_dims); - auto scratch2 = XlaHelpers::ConvertElementType(b, reduce, scale_dtype); + auto scratch2 = XlaHelpers::ConvertElementType(reduce, scale_dtype); x_backprop = xla::Mul(grad_backprop, xla::Mul(scratch1, scale), {feature_index}); scale_backprop = xla::Mul(scratch1, scratch2); } - ctx->SetOutput(0, - XlaHelpers::ConvertElementType(b, x_backprop, input_dtype)); + ctx->SetOutput(0, XlaHelpers::ConvertElementType(x_backprop, input_dtype)); ctx->SetOutput(1, scale_backprop); ctx->SetOutput(2, offset_backprop); ctx->SetConstantOutput(3, Tensor()); diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc 
b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc index 41f540506ba..e7f369b761f 100644 --- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc @@ -107,11 +107,11 @@ class BiasAddGradOp : public XlaOpKernel { const DataType accumulation_type = XlaHelpers::SumAccumulationType(input_type(0)); auto converted = - XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type); + XlaHelpers::ConvertElementType(ctx->Input(0), accumulation_type); auto reduce = xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), reduce_dims); - ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, reduce, input_type(0))); + ctx->SetOutput(0, XlaHelpers::ConvertElementType(reduce, input_type(0))); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index 47e517a6576..5e9280c1fe6 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -43,6 +43,9 @@ namespace { const std::vector& extend_dimensions) override { \ xla::XlaBuilder* b = ctx->builder(); \ (void)b; \ + (void)lhs_shape; \ + (void)rhs_shape; \ + (void)extend_dimensions; \ return HLO; \ } \ }; \ @@ -103,23 +106,23 @@ static xla::XlaOp FloorDivImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x, XLA_MAKE_BINARY(FloorDiv, FloorDivImpl(b, input_type(0), lhs, rhs, broadcast_helper)); -static xla::XlaOp XlogyImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x, - xla::XlaOp y, const BCast& broadcast_helper) { +xla::XlaOp XlogyImpl(xla::XlaOp x, xla::XlaOp y, + const BCast& broadcast_helper) { std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper); - auto zero = XlaHelpers::Zero(b, dtype); + auto zero = xla::ZerosLike(x); auto is_zero = xla::Eq(x, zero); return xla::Select(is_zero, zero, xla::Mul(x, xla::Log(y))); } -XLA_MAKE_BINARY(Xlogy, XlogyImpl(b, input_type(0), lhs, rhs, broadcast_helper)); +XLA_MAKE_BINARY(Xlogy, XlogyImpl(lhs, rhs, broadcast_helper)); -static xla::XlaOp XdivyImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x, - xla::XlaOp y, const BCast& broadcast_helper) { +xla::XlaOp XdivyImpl(xla::XlaOp x, xla::XlaOp y, + const BCast& broadcast_helper) { std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper); - auto zero = XlaHelpers::Zero(b, dtype); + auto zero = xla::ZerosLike(x); auto is_zero = xla::Eq(x, zero); return xla::Select(is_zero, zero, xla::Div(x, y)); } -XLA_MAKE_BINARY(Xdivy, XdivyImpl(b, input_type(0), lhs, rhs, broadcast_helper)); +XLA_MAKE_BINARY(Xdivy, XdivyImpl(lhs, rhs, broadcast_helper)); // Implementation of FloorMod. Pseudo-code: // T trunc_mod = std::fmod(x, y); diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc index ad85940920e..7199b9b6feb 100644 --- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc @@ -21,10 +21,13 @@ limitations under the License. 
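The Xlogy/Xdivy rewrite above keeps the key semantic: the result is forced to zero wherever x is zero, so log(0) or a division by zero on those elements never leaks into the output (xla::ZerosLike just derives the zero from x instead of from a DataType). A minimal NumPy sketch of that select-around-the-singularity pattern, purely illustrative and not the kernel's code path:

```
import numpy as np

def xlogy(x, y):
    # x * log(y), but exactly 0 wherever x == 0, even where log(y) is -inf or nan.
    with np.errstate(divide="ignore", invalid="ignore"):
        prod = x * np.log(y)
    return np.where(x == 0, 0.0, prod)

def xdivy(x, y):
    # x / y, but exactly 0 wherever x == 0, even where y == 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        quot = x / y
    return np.where(x == 0, 0.0, quot)

x = np.array([0.0, 2.0, 3.0])
y = np.array([0.0, 1.0, 0.5])
print(xlogy(x, y))  # [ 0.  0. -2.079...]  -- no nan from 0 * log(0)
print(xdivy(x, y))  # [0. 2. 6.]           -- no nan from 0 / 0
```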
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/prng.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" namespace tensorflow { namespace { @@ -57,11 +60,9 @@ class CategoricalOp : public XlaOpKernel { const int64 batch_size = logits_shape.dim_size(0); const int64 num_classes = logits_shape.dim_size(1); - xla::XlaBuilder* builder = ctx->builder(); - xla::Shape uniform_shape; int class_dimension; - if (num_samples > 1) { + if (num_samples != 1) { std::array uniform_shape_array = { {batch_size, num_samples, num_classes}}; xla::PrimitiveType uniform_xla_type; @@ -83,16 +84,16 @@ class CategoricalOp : public XlaOpKernel { xla::ShapeUtil::MakeShape(uniform_xla_type, uniform_shape_array); class_dimension = 1; } - xla::XlaOp uniforms = - xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)), - XlaHelpers::One(builder, input_type(0)), uniform_shape); + xla::PrimitiveType type; + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(0), &type)); + xla::XlaOp log_uniforms = GetLogUniforms(uniform_shape, type, ctx); // Use Gumbel softmax trick to generate categorical samples. // See: // https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/ // TODO(b/68769470): Switch to using a cumulative sum approach. auto softmax_entries = - xla::Sub(logits, xla::Log(-xla::Log(uniforms)), + xla::Sub(logits, log_uniforms, /*broadcast_dimensions=*/{0, class_dimension}); xla::PrimitiveType xla_output_type; @@ -107,6 +108,16 @@ class CategoricalOp : public XlaOpKernel { ctx->SetOutput(0, argmax); } + virtual xla::XlaOp GetLogUniforms(xla::Shape uniform_shape, + xla::PrimitiveType type, + XlaOpKernelContext* ctx) { + xla::XlaBuilder* builder = ctx->builder(); + auto uniforms = + xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)), + XlaHelpers::One(builder, input_type(0)), uniform_shape); + return xla::Log(-xla::Log(uniforms)); + } + private: TF_DISALLOW_COPY_AND_ASSIGN(CategoricalOp); }; @@ -115,5 +126,48 @@ class CategoricalOp : public XlaOpKernel { REGISTER_XLA_OP(Name("Multinomial").CompileTimeConstantInput("num_samples"), CategoricalOp); +class StatelessCategoricalOp : public CategoricalOp { + public: + explicit StatelessCategoricalOp(OpKernelConstruction* ctx) + : CategoricalOp(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + xla::XlaOp GetLogUniforms(xla::Shape uniform_shape, xla::PrimitiveType type, + XlaOpKernelContext* ctx) override { + xla::XlaOp seed = ctx->Input(2); + auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {}); + auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {}); + + xla::XlaBuilder* builder = ctx->builder(); + if (uniform_shape.element_type() == xla::BF16) { + uniform_shape.set_element_type(xla::F32); + } + auto uniforms = xla::StatelessRngUniform( + {seed0, seed1}, uniform_shape, XlaHelpers::Zero(builder, DT_FLOAT), + XlaHelpers::One(builder, DT_FLOAT)); + return xla::ConvertElementType(xla::Log(-xla::Log(uniforms)), type); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape seed_shape = ctx->InputShape(2); + OP_REQUIRES(ctx, seed_shape.dims() == 1 && seed_shape.dim_size(0) == 
2, + errors::InvalidArgument("seed must have shape [2], not ", + seed_shape.DebugString())); + CategoricalOp::Compile(ctx); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessCategoricalOp); +}; + +REGISTER_XLA_OP(Name("StatelessMultinomial") + .CompileTimeConstantInput("num_samples") + .TypeConstraint("T", {DT_FLOAT, DT_BFLOAT16}) + .TypeConstraint("Tseed", DT_INT32), + StatelessCategoricalOp); + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc index c9a1be49406..641fefafb35 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/node_def_util.h" @@ -65,60 +64,63 @@ xla::Shape ExpandedFilterShapeForDepthwiseConvolution(const xla::Shape& shape) { // 0 0 1 1 0 0 0 0 1 1 0 0 // 0 0 0 0 1 1 0 0 0 0 1 1 // -// The first step is to create a one tensor, A, that is [3] -// 0 1 2 +// The first step is to create a iota A with iota_dimension = 2 +// 0 0 0 0 0 0 0 0 0 0 0 0 +// 1 1 1 1 1 1 1 1 1 1 1 1 +// 2 2 2 2 2 2 2 2 2 2 2 2 // -// and another tensor, B, that is [3 * 2] -// 0 1 2 3 4 5 +// 0 0 0 0 0 0 0 0 0 0 0 0 +// 1 1 1 1 1 1 1 1 1 1 1 1 +// 2 2 2 2 2 2 2 2 2 2 2 2 // -// and divide B it by 2 to get -// 0 0 1 1 2 2 +// and another iota B with iota_dimension = 3 +// 0 1 2 3 4 5 0 1 2 3 4 5 +// 0 1 2 3 4 5 0 1 2 3 4 5 +// 0 1 2 3 4 5 0 1 2 3 4 5 // -// then we broadcast the B to [2, 2, 3, 3 * 2] -// 0 0 1 1 2 2 0 0 1 1 2 2 -// 0 0 1 1 2 2 0 0 1 1 2 2 -// 0 0 1 1 2 2 0 0 1 1 2 2 +// 0 1 2 3 4 5 0 1 2 3 4 5 +// 0 1 2 3 4 5 0 1 2 3 4 5 +// 0 1 2 3 4 5 0 1 2 3 4 5 // -// 0 0 1 1 2 2 0 0 1 1 2 2 -// 0 0 1 1 2 2 0 0 1 1 2 2 -// 0 0 1 1 2 2 0 0 1 1 2 2 +// and divide B by 2 to get +// 0 0 1 1 2 2 0 0 1 1 2 2 +// 0 0 1 1 2 2 0 0 1 1 2 2 +// 0 0 1 1 2 2 0 0 1 1 2 2 // -// Finally compare A and broadcasted B in dimension 2 amd return the result at -// the beginning of the comment. +// 0 0 1 1 2 2 0 0 1 1 2 2 +// 0 0 1 1 2 2 0 0 1 1 2 2 +// 0 0 1 1 2 2 0 0 1 1 2 2 +// +// Finally compare A and B and return the result at the beginning of the +// comment. xla::XlaOp CreateExpandedFilterMask(const xla::Shape& filter_shape, xla::XlaBuilder* builder) { xla::Shape expanded_filter_shape = ExpandedFilterShapeForDepthwiseConvolution(filter_shape); int64 depthwise_multiplier = filter_shape.dimensions(filter_shape.dimensions_size() - 1); - int64 input_feature = - filter_shape.dimensions(filter_shape.dimensions_size() - 2); - // Create a M sized linspace and an M*N sized linspace that will be - // broadcasted into perpendicular dimensions and compared. - xla::XlaOp input_feature_iota = xla::Iota(builder, xla::S32, input_feature); - xla::XlaOp expanded_feature_iota = - xla::Iota(builder, xla::S32, input_feature * depthwise_multiplier); + // Create two iotas with the shape of the expanded filter, one of them with + // the iota dimension chosen as the feature dimension, and the other a iota + // with the iota dimension chosen as the expanded output feature dimension. 
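For the Multinomial/StatelessMultinomial kernels above: the refactored GetLogUniforms feeds log(-log(U)) into the Gumbel-max trick referenced in the comment, so an argmax over logits - log(-log(U)) draws one categorical sample per row. A small NumPy sketch of that trick; names here are illustrative only, the kernel works on XLA ops and the stateless variant derives U from the seed pair:

```
import numpy as np

def gumbel_max_sample(logits, num_samples, rng):
    # Subtracting log(-log(U)) with U ~ Uniform(0, 1) is the same as adding
    # standard Gumbel noise, so the argmax is a categorical sample.
    batch, num_classes = logits.shape
    u = rng.uniform(size=(batch, num_samples, num_classes))
    noisy = logits[:, None, :] - np.log(-np.log(u))
    return np.argmax(noisy, axis=-1)          # shape [batch, num_samples]

rng = np.random.default_rng(0)
logits = np.log(np.array([[0.1, 0.6, 0.3]]))
samples = gumbel_max_sample(logits, 10000, rng)
print(np.bincount(samples[0], minlength=3) / 10000)  # roughly [0.1, 0.6, 0.3]
```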
+ std::vector iota_dimensions(expanded_filter_shape.dimensions().begin(), + expanded_filter_shape.dimensions().end()); + xla::Shape iota_shape = xla::ShapeUtil::MakeShape(xla::S32, iota_dimensions); + xla::XlaOp input_feature_iota = xla::Iota( + builder, iota_shape, /*iota_dimension=*/iota_dimensions.size() - 2); + xla::XlaOp expanded_feature_iota = xla::Iota( + builder, iota_shape, /*iota_dimension=*/iota_dimensions.size() - 1); - // Divide the M*N sized linspace by the depthwise_multiplier to create - // [0 0 1 1 2 2] in the example in the function comment. + // Divide 'expanded_feature_iota' by the depthwise_multiplier to create + // [0 0 1 1 2 2] ... in the example in the function comment. expanded_feature_iota = xla::Div(expanded_feature_iota, XlaHelpers::IntegerLiteral(builder, DataType::DT_INT32, depthwise_multiplier)); - // Broadcast the N*M linspace to [H, W, ..., M, M*N]. - std::vector expanded_feature_broadcast_dims( - expanded_filter_shape.dimensions().begin(), - expanded_filter_shape.dimensions().end()); - expanded_feature_broadcast_dims.pop_back(); - auto broadcasted_expanded_feature_iota = - xla::Broadcast(expanded_feature_iota, expanded_feature_broadcast_dims); - - // Compare the broadcasted linspace to the input feature linspace in the - // input feature dimension to create a diagonal predicate. - return xla::Eq(broadcasted_expanded_feature_iota, input_feature_iota, - {expanded_filter_shape.dimensions_size() - 2}); + // Compare 'input_feature_iota' with 'expanded_feature_iota' to create a + // diagonal predicate. + return xla::Eq(expanded_feature_iota, input_feature_iota); } // Reshapes a filter of shape [H, W, ..., M, N] to [H, W, ..., 1, M*N]. Used to diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc index d820528a430..eafdba876ae 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/node_def_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc index 49c12fc2320..ee79cbc70da 100644 --- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
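For the CreateExpandedFilterMask rewrite just above, a NumPy sketch of the two-iota construction on the trailing two dimensions only (input features by expanded output features; the real code builds both iotas with the full expanded filter shape):

```
import numpy as np

input_feature, multiplier = 3, 2
shape = (input_feature, input_feature * multiplier)

# iota along the input-feature dimension (rows) and along the expanded
# output-feature dimension (columns).
iota_feature = np.broadcast_to(np.arange(shape[0])[:, None], shape)
iota_expanded = np.broadcast_to(np.arange(shape[1]), shape)

# Divide the expanded iota by the depthwise multiplier and compare: true exactly
# where an expanded output feature belongs to that input feature.
mask = (iota_expanded // multiplier) == iota_feature
print(mask.astype(int))
# [[1 1 0 0 0 0]
#  [0 0 1 1 0 0]
#  [0 0 0 0 1 1]]
```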
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc index b2f6ef43fa9..6e6ba21daf5 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc @@ -113,8 +113,20 @@ class DynamicStitchOp : public XlaOpKernel { } } int number_of_indices = max_index + 1; - OP_REQUIRES(ctx, number_of_indices > 0, - errors::InvalidArgument("no indices supplied")); + int64 result_rank = 1 + data0_shape.dims() - indices0_shape.dims(); + if (number_of_indices == 0) { + std::vector result_shape(result_rank); + for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) { + result_shape[d - indices0_shape.dims() + 1] = data0_shape.dim_size(d); + } + xla::PrimitiveType element_type = + ctx->input_xla_type(ctx->num_inputs() - 1); + xla::Literal empty_literal = xla::Literal::CreateFromShape( + xla::ShapeUtil::MakeShape(element_type, result_shape)); + ctx->SetOutput(0, xla::ConstantLiteral(ctx->builder(), empty_literal)); + return; + } + // Construct the reverse mapping, for each index, of which slice of which // input it comes from. std::vector src_input_vector(number_of_indices); @@ -157,12 +169,9 @@ class DynamicStitchOp : public XlaOpKernel { // Set up the vectors for slicing: the first dimension will vary // slice by slice, and the rest take the full common extra shape. - std::vector slice_start(1 + data0_shape.dims() - - indices0_shape.dims()); - std::vector slice_limit(1 + data0_shape.dims() - - indices0_shape.dims()); - std::vector stride(1 + data0_shape.dims() - indices0_shape.dims(), - 1); + std::vector slice_start(result_rank); + std::vector slice_limit(result_rank); + std::vector stride(result_rank, 1); for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) { slice_limit[1 + d - indices0_shape.dims()] = data0_shape.dim_size(d); } diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc index c68b0bfd796..29687c7b82f 100644 --- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -17,7 +17,6 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/util/tensor_format.h" diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc index cdba6680dee..142be030f73 100644 --- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc @@ -260,19 +260,19 @@ class FakeQuantWithMinMaxVarsGradOp : public XlaOpKernel { xla::XlaOp below_min = xla::Lt(input, nudged_input_min); xla::XlaOp select1 = xla::Select(below_min, gradient, zeroes); xla::XlaOp reduce1 = xla::ReduceAll( - XlaHelpers::ConvertElementType(b, select1, accumulation_type), + XlaHelpers::ConvertElementType(select1, accumulation_type), XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type)); - xla::XlaOp output1 = XlaHelpers::ConvertElementType(b, reduce1, data_type); + xla::XlaOp output1 = XlaHelpers::ConvertElementType(reduce1, data_type); ctx->SetOutput(1, output1); xla::XlaOp above_max = xla::Gt(input, nudged_input_max); xla::XlaOp select2 = xla::Select(above_max, gradient, zeroes); xla::XlaOp reduce2 = xla::ReduceAll( - XlaHelpers::ConvertElementType(b, select2, accumulation_type), + XlaHelpers::ConvertElementType(select2, accumulation_type), XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type)); - xla::XlaOp output2 = XlaHelpers::ConvertElementType(b, reduce2, data_type); + xla::XlaOp output2 = XlaHelpers::ConvertElementType(reduce2, data_type); ctx->SetOutput(2, output2); } diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc index 9b06357d9b7..6df8b5367d2 100644 --- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" @@ -50,11 +51,36 @@ class GenericFftOp : public XlaOpKernel { errors::InvalidArgument("input must be at least 1 dimensional")); std::vector fft_length; + xla::XlaOp input = ctx->Input(0); if (fft_type_ == FftType::RFFT || fft_type_ == FftType::IRFFT) { OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &fft_length)); OP_REQUIRES(ctx, fft_length.size() == fft_rank_, errors::InvalidArgument("fft_length must be length ", fft_rank_, " vector")); + + // Zero pad or truncate the axes we're doing FFT on. + absl::InlinedVector slice_sizes = input_shape.dim_sizes(); + std::vector> padding_sizes(slice_sizes.size()); + std::vector expected_sizes = fft_length; + // IRFFT wants the innermost axis to be n / 2 + 1. 
+ if (fft_type_ == FftType::IRFFT) { + expected_sizes[fft_rank_ - 1] = fft_length[fft_rank_ - 1] / 2 + 1; + } + for (int i = 0; i < fft_rank_; i++) { + int index = input_shape.dims() - fft_rank_ + i; + if (input_shape.dim_size(index) > expected_sizes[i]) { + slice_sizes[index] = expected_sizes[i]; + } else { + padding_sizes[index].second = + expected_sizes[i] - input_shape.dim_size(index); + } + } + + std::vector start_indices(input_shape.dims(), 0); + std::vector strides(input_shape.dims(), 1); + input = xla::Pad(xla::Slice(input, start_indices, slice_sizes, strides), + XlaHelpers::Zero(ctx->builder(), ctx->input_type(0)), + xla::MakeEdgePaddingConfig(padding_sizes)); } else { // Innermost axis provides the FFT length. for (int i = 0; i < fft_rank_; i++) { @@ -63,7 +89,7 @@ class GenericFftOp : public XlaOpKernel { } } - xla::XlaOp fft = xla::Fft(ctx->Input(0), fft_type_, fft_length); + xla::XlaOp fft = xla::Fft(input, fft_type_, fft_length); ctx->SetOutput(0, fft); } diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc index 56da50f1408..b5e08391255 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc @@ -72,7 +72,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { arg.shape = resource->shape(); OP_REQUIRES(ctx, arg.initialized, errors::Unimplemented("Uninitialized arguments: ", arg.name)); - arg.tensor_array_size = resource->tensor_array_size(); + arg.max_array_size = resource->max_array_size(); for (const auto& gradient : resource->tensor_array_gradients()) { arg.tensor_array_gradients.insert(gradient.first); } diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc index b49b2516d8b..e9bb0a77e99 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc @@ -191,12 +191,11 @@ class AdjustContrastOpV2 : public XlaOpKernel { DataType type = context->input_type(0); const DataType accumulation_type = XlaHelpers::SumAccumulationType(type); - auto converted = - XlaHelpers::ConvertElementType(b, input, accumulation_type); + auto converted = XlaHelpers::ConvertElementType(input, accumulation_type); auto reduce = xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type), *context->GetOrCreateAdd(accumulation_type), {height_dim, width_dim}); - auto output = XlaHelpers::ConvertElementType(b, reduce, type); + auto output = XlaHelpers::ConvertElementType(reduce, type); output = xla::Div(output, XlaHelpers::FloatLiteral(b, type, height * width)); diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc index 0c7ca602bfa..5a10c52ba8b 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc @@ -19,7 +19,6 @@ limitations under the License. 
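The GenericFftOp change above slices or zero-pads the transformed axes so their extents match fft_length (with IRFFT expecting fft_length // 2 + 1 input elements on the innermost axis) before calling xla::Fft. A NumPy sketch of that fitting step under the same assumptions; the function name is illustrative:

```
import numpy as np

def fit_fft_axes(x, fft_length, irfft=False):
    # Zero-pad or truncate only the trailing len(fft_length) axes of x.
    fft_rank = len(fft_length)
    expected = list(fft_length)
    if irfft:
        expected[-1] = fft_length[-1] // 2 + 1   # IRFFT innermost axis wants n/2 + 1
    slices, pads = [], []
    for dim, size in enumerate(x.shape):
        i = dim - (x.ndim - fft_rank)
        want = size if i < 0 else min(size, expected[i])
        slices.append(slice(0, want))
        pads.append((0, 0 if i < 0 else max(0, expected[i] - size)))
    return np.pad(x[tuple(slices)], pads)

x = np.ones((2, 5))
print(fit_fft_axes(x, [8]).shape)              # (2, 8) -- zero-padded for RFFT
print(fit_fft_axes(x, [8], irfft=True).shape)  # (2, 5) -- 8 // 2 + 1 == 5 already
```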
#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc index e310db2162d..e2c05b648bb 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc @@ -30,7 +30,9 @@ limitations under the License. namespace tensorflow { namespace { -// The logic below uses a custom-call to implement argmax. +// The logic below uses a custom-call to implement argmax when possible. When +// custom-call is not allowed or input shapes are not supported, this kernel +// falls back to using XLA HLO native ArgMax. // // Also see b/29507024 for first-class XLA support for indexing ops. class ArgMaxCustomCallOp : public XlaOpKernel { @@ -50,27 +52,40 @@ class ArgMaxCustomCallOp : public XlaOpKernel { // overhead, when compiling ahead-of-time. int64 dim; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &dim)); - OP_REQUIRES(ctx, dim >= 0, errors::InvalidArgument("dim must be >= 0")); - OP_REQUIRES( - ctx, dim < input_shape.dims(), - errors::InvalidArgument("dim must be < input rank (", - input_shape.dims(), "), but got: ", dim)); - const int64 dim_size = input_shape.dim_size(dim); - OP_REQUIRES(ctx, dim_size > 0, + + const int input_dims = input_shape.dims(); + const int axis = dim < 0 ? dim + input_dims : dim; + OP_REQUIRES(ctx, axis >= 0 && axis < input_dims, + errors::InvalidArgument("Expected dimension in the range [", + -input_dims, ", ", input_dims, + "), but got ", dim)); + + const int64 axis_size = input_shape.dim_size(axis); + OP_REQUIRES(ctx, axis_size > 0, errors::InvalidArgument( "Reduction axis ", dim, " is empty in shape: ", input_shape.DebugString())); - // The output shape is the input shape contracted along dim. - TensorShape output_shape; - for (int d = 0; d < input_shape.dims() - 1; ++d) { - output_shape.AddDim(input_shape.dim_size((d < dim) ? d : d + 1)); + const DataType dtype = output_type(0); + xla::PrimitiveType output_type; + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype, &output_type)); + + // Fall back to XLA ArgMax HLO when CustomCall is not allowed or when input + // shape isn't supported. + if (!ctx->compiler()->options().allow_cpu_custom_calls || + (input_dims != 1 && input_dims != 2)) { + xla::XlaOp output = XlaHelpers::ArgMax(ctx->Input(0), output_type, axis); + ctx->SetOutput(0, output); + return; + } + + xla::XlaOp output; + // The output shape is the input shape contracted along axis. + TensorShape output_shape; + for (int d = 0; d < input_shape.dims() - 1; ++d) { + output_shape.AddDim(input_shape.dim_size((d < axis) ? d : d + 1)); } - // For now we use a custom-call, only for the 1d and 2d cases. - OP_REQUIRES(ctx, XlaContext::Get(ctx).allow_cpu_custom_calls(), - errors::InvalidArgument( - "ArgMax implementation requires a CustomCall on CPU")); xla::XlaBuilder& b = *ctx->builder(); // XLA passes to the function, so it is not included here. 
@@ -84,7 +99,7 @@ class ArgMaxCustomCallOp : public XlaOpKernel { args.push_back(xla::ConstantLiteral( &b, xla::LiteralUtil::CreateR1(output_shape.dim_sizes()))); args.push_back( - xla::ConstantLiteral(&b, xla::LiteralUtil::CreateR0(dim))); + xla::ConstantLiteral(&b, xla::LiteralUtil::CreateR0(axis))); } // The argmax function expects row-major layout. @@ -101,24 +116,15 @@ class ArgMaxCustomCallOp : public XlaOpKernel { } // Tell XLA to call the custom code, defined in - // index_ops_kernel_argmax_float_1d.cc. - xla::XlaOp output; - switch (input_shape.dims()) { - case 1: - output = xla::CustomCallWithLayout(&b, "argmax_float_1d_xla_impl", args, - xla_shape, arg_shapes); - break; - case 2: - output = xla::CustomCallWithLayout(&b, "argmax_float_2d_xla_impl", args, - xla_shape, arg_shapes); - break; - default: - OP_REQUIRES(ctx, false, - errors::Unimplemented( - "Argmax is only implemented for 1d and 2d tensors" - ", but got shape: ", - input_shape.DebugString())); + // index_ops_kernel_argmax_float_{1, 2}d.cc. + if (input_dims == 1) { + output = xla::CustomCallWithLayout(&b, "argmax_float_1d_xla_impl", args, + xla_shape, arg_shapes); + } else { + output = xla::CustomCallWithLayout(&b, "argmax_float_2d_xla_impl", args, + xla_shape, arg_shapes); } + output = xla::ConvertElementType(output, output_type); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc index f028e361bcc..93f029731c3 100644 --- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc @@ -37,12 +37,11 @@ class L2LossOp : public XlaOpKernel { // output = sum(t ** 2) / 2 const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype); - auto t = - XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type); + auto t = XlaHelpers::ConvertElementType(ctx->Input(0), accumulation_type); auto square = xla::Mul(t, t); auto reduce = xla::Reduce(square, XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), dims); - auto deconverted = XlaHelpers::ConvertElementType(b, reduce, dtype); + auto deconverted = XlaHelpers::ConvertElementType(reduce, dtype); auto two = XlaHelpers::IntegerLiteral(b, dtype, 2); ctx->SetOutput(0, xla::Div(deconverted, two)); } diff --git a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc index 87ee2d3aede..987901d82b3 100644 --- a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc @@ -49,16 +49,14 @@ class LRNOp : public XlaOpKernel { // We use a window of depth_radius_ * 2 + 1, to account for the current // element and a depth_radius_ on either side. 
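Several kernels above (BiasAddGrad, L2Loss, the LRN ops, AdjustContrastV2) follow the same pattern with the new ConvertElementType signature: convert to the sum-accumulation type, reduce, convert back. A sketch using L2Loss, assuming SumAccumulationType promotes 16-bit floats to float32, which is how it is used here:

```
import numpy as np

def l2_loss(t):
    # output = sum(t ** 2) / 2, accumulated in float32 so a float16 input does
    # not lose precision in the sum, then converted back to the input type.
    acc = t.astype(np.float32)
    return (np.sum(acc * acc) / 2).astype(t.dtype)

t = np.array([1.0, 2.0, 3.0], dtype=np.float16)
print(l2_loss(t))  # 7.0
```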
auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0)); - auto converted = - XlaHelpers::ConvertElementType(builder, input, accumulation_type); + auto converted = XlaHelpers::ConvertElementType(input, accumulation_type); auto squared = xla::Mul(converted, converted); auto reduce = xla::ReduceWindow( squared, XlaHelpers::Zero(builder, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1}, /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame); - auto sqr_sum = - XlaHelpers::ConvertElementType(builder, reduce, input_type(0)); + auto sqr_sum = XlaHelpers::ConvertElementType(reduce, input_type(0)); auto scale = xla::Pow( xla::Add(xla::ConstantR0(builder, bias_), @@ -138,15 +136,14 @@ class LRNGradOp : public XlaOpKernel { auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0)); auto converted = - XlaHelpers::ConvertElementType(builder, in_image, accumulation_type); + XlaHelpers::ConvertElementType(in_image, accumulation_type); auto squared = xla::Mul(converted, converted); auto reduce = xla::ReduceWindow( squared, XlaHelpers::Zero(builder, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1}, /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame); - auto sqr_sum = - XlaHelpers::ConvertElementType(builder, reduce, input_type(0)); + auto sqr_sum = XlaHelpers::ConvertElementType(reduce, input_type(0)); auto norm = xla::Add(xla::ConstantR0(builder, bias_), @@ -157,15 +154,13 @@ class LRNGradOp : public XlaOpKernel { xla::Div(out_image, norm)), in_grads); - auto converted_dy = - XlaHelpers::ConvertElementType(builder, dy, accumulation_type); + auto converted_dy = XlaHelpers::ConvertElementType(dy, accumulation_type); auto dy_reduce = xla::ReduceWindow( converted_dy, XlaHelpers::Zero(builder, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1}, /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame); - auto dy_reduced = - XlaHelpers::ConvertElementType(builder, dy_reduce, input_type(0)); + auto dy_reduced = XlaHelpers::ConvertElementType(dy_reduce, input_type(0)); xla::XlaOp gradients = xla::Add( xla::Mul(in_image, dy_reduced), diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc index 8dfd7de591c..2dd0a710e47 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc @@ -16,8 +16,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/framework/tensor_shape.h" namespace tensorflow { @@ -61,11 +61,11 @@ class MatrixBandPartOp : public XlaOpKernel { // Compute 'offset', which is how many diagonals we are above/below the // diagonal. 
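The matrix_band_part change below replaces the broadcast-and-subtract with two 2-D iotas whose difference is the diagonal offset. A NumPy sketch of the same offset/band computation (illustrative; negative bounds keep everything on that side, as in the TF op):

```
import numpy as np

def matrix_band_part(x, num_lower, num_upper):
    m, n = x.shape[-2:]
    iota_m = np.arange(m).reshape(-1, 1)   # like the iota with iota_dimension = 0
    iota_n = np.arange(n).reshape(1, -1)   # like the iota with iota_dimension = 1
    offset = iota_n - iota_m               # how far above (+) / below (-) the diagonal
    keep = (((num_lower < 0) | (offset >= -num_lower)) &
            ((num_upper < 0) | (offset <= num_upper)))
    return np.where(keep, x, 0)

x = np.arange(1, 17).reshape(4, 4)
print(matrix_band_part(x, 1, 0))  # keeps the main diagonal and one sub-diagonal
```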
- xla::XlaOp iota_m = xla::Iota(builder, index_xla_type, m); - xla::XlaOp iota_n = xla::Iota(builder, index_xla_type, n); + xla::Shape iota_shape = xla::ShapeUtil::MakeShape(index_xla_type, {m, n}); + xla::XlaOp iota_m = xla::Iota(builder, iota_shape, /*iota_dimension=*/0); + xla::XlaOp iota_n = xla::Iota(builder, iota_shape, /*iota_dimension=*/1); - auto offset = xla::Sub(xla::Broadcast(iota_n, {m}), iota_m, - /*broadcast_dimensions=*/{0}); + auto offset = xla::Sub(iota_n, iota_m); // If num_lower or num_upper are negative, include all lower/upper // diagonals. diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc index c0ca881ff82..4f980b6d14e 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" namespace tensorflow { diff --git a/tensorflow/compiler/tf2xla/kernels/permute_op.cc b/tensorflow/compiler/tf2xla/kernels/permute_op.cc index 94b51e1a586..71920bf5c1e 100644 --- a/tensorflow/compiler/tf2xla/kernels/permute_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/permute_op.cc @@ -75,8 +75,7 @@ class DataFormatVecPermuteOp : public XlaOpKernel { } auto keys = xla::ConstantR1(builder, absl::Span(dst_indices)); if (input_rank == 2) { - keys = xla::BroadcastInDim( - keys, xla::ShapeUtil::MakeShape(xla::S32, {4, 2}), {0}); + keys = xla::BroadcastInDim(keys, {4, 2}, {0}); } auto sorted = xla::Sort(keys, {ctx->Input(0)}, 0); auto output = xla::GetTupleElement(sorted, 1); diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index a259da6383d..06c6cc37ec9 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -152,7 +152,12 @@ class MaxPoolOp : public PoolingOp { public: MaxPoolOp(OpKernelConstruction* ctx, int num_spatial_dims) : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims, - /*reduction_type=*/ctx->input_type(0)) {} + /*reduction_type=*/ctx->input_type(0)) { + string data_format_str; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); + OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), + errors::InvalidArgument("Invalid data format")); + } void Compile(XlaOpKernelContext* ctx) override { auto ksize_or_error = GetKernelSize(ctx); @@ -180,10 +185,6 @@ class MaxPool2DOp : public MaxPoolOp { public: explicit MaxPool2DOp(OpKernelConstruction* ctx) : MaxPoolOp(ctx, /*num_spatial_dims=*/2) { - string data_format_str; - OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); - OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), - errors::InvalidArgument("Invalid data format")); } }; REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp); @@ -204,7 +205,12 @@ class AvgPoolOp : public PoolingOp { AvgPoolOp(OpKernelConstruction* ctx, int num_spatial_dims) : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims, /*reduction_type=*/ - XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} + XlaHelpers::SumAccumulationType(ctx->input_type(0))) { + string data_format_str; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); + 
OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), + errors::InvalidArgument("Invalid data format")); + } void Compile(XlaOpKernelContext* ctx) override { auto ksize_or_error = GetKernelSize(ctx); @@ -241,10 +247,6 @@ class AvgPool2DOp : public AvgPoolOp { public: explicit AvgPool2DOp(OpKernelConstruction* ctx) : AvgPoolOp(ctx, /*num_spatial_dims=*/2) { - string data_format_str; - OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); - OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), - errors::InvalidArgument("Invalid data format")); } }; REGISTER_XLA_OP(Name("AvgPool"), AvgPool2DOp); @@ -390,6 +392,11 @@ class AvgPoolGradOp : public XlaOpKernel { OP_REQUIRES(ctx, ksize_[0] == 1 && stride_[0] == 1, errors::Unimplemented( "Pooling is not yet supported on the batch dimension.")); + + string data_format; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); + OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); } int num_dims() const { return num_spatial_dims_ + 2; } @@ -449,10 +456,6 @@ class AvgPool2DGradOp : public AvgPoolGradOp { public: explicit AvgPool2DGradOp(OpKernelConstruction* ctx) : AvgPoolGradOp(ctx, /*num_spatial_dims=*/2) { - string data_format; - OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); - OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), - errors::InvalidArgument("Invalid data format")); } }; REGISTER_XLA_OP( diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc index 6f4ed496a17..7fe102428db 100644 --- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/math.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/core/platform/macros.h" @@ -26,12 +27,26 @@ limitations under the License. namespace tensorflow { namespace { +enum QuantizerRoundMode { + // Round half up: if the fraction of y is exactly 0.5, then + // round(y) = y + 0.5 + // E.g., -5.5 gets rounded to -5, -5.4 goes to -5, + // 5.4 goes to 5, and 5.5 goes to 6. + ROUND_HALF_UP, + // Round half to even: if the fraction of y is exactly 0.5, then round(y) is + // the nearest even integer to y. + // E.g., 23.5 gets rounded to 24, 24.5 gets rounded to 24, while -23.5 becomes + // -24, and -24.5 gets rounded to 24. + ROUND_HALF_TO_EVEN, +}; + class QuantizeAndDequantizeOp : public XlaOpKernel { public: explicit QuantizeAndDequantizeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("signed_input", &signed_input_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_)); + round_mode_ = ROUND_HALF_TO_EVEN; } void Compile(XlaOpKernelContext* ctx) override { @@ -117,8 +132,17 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { // in that case they were measured from the tensor. 
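The QuantizerRoundMode enum above distinguishes round-half-up from round-half-to-even, and the switch added further down in Compile applies one or the other to input * scale. A simplified NumPy sketch of just that rounding step (it omits the range nudging and clamping the real op performs first):

```
import numpy as np

def quantize_dequantize(x, scale, round_mode="HALF_TO_EVEN"):
    # Round x * scale with the chosen mode, then undo the scaling.
    scaled = x * scale
    if round_mode == "HALF_TO_EVEN":
        rounded = np.round(scaled)           # NumPy rounds exact halves to even
    elif round_mode == "HALF_UP":
        rounded = np.floor(scaled + 0.5)
    else:
        raise ValueError(round_mode)
    return rounded / scale

x = np.array([0.75, 1.25, 1.75, -1.25])
print(quantize_dequantize(x, 2.0, "HALF_TO_EVEN"))  # [ 1.   1.   2.  -1. ]
print(quantize_dequantize(x, 2.0, "HALF_UP"))       # [ 1.   1.5  2.  -1. ]
```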
input = Clamp(min_range, input, max_range); } - xla::XlaOp result = - Floor((input - min_range) * scale + half) * inverse_scale + min_range; + xla::XlaOp result; + switch (round_mode_) { + case ROUND_HALF_TO_EVEN: { + result = xla::RoundToEven(input * scale) * inverse_scale; + break; + } + case ROUND_HALF_UP: { + result = Floor(input * scale + half) * inverse_scale; + break; + } + } ctx->SetOutput(0, result); } @@ -126,6 +150,7 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { int64 num_bits_ = -1; bool signed_input_; bool range_given_; + QuantizerRoundMode round_mode_; }; class QuantizeAndDequantizeV2Op : public QuantizeAndDequantizeOp { @@ -136,6 +161,20 @@ class QuantizeAndDequantizeV2Op : public QuantizeAndDequantizeOp { OP_REQUIRES(ctx, num_bits_ > 0 && num_bits_ < (signed_input_ ? 62 : 63), errors::InvalidArgument("num_bits is out of range: ", num_bits_, " with signed_input_ ", signed_input_)); + string round_mode_string; + OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string)); + OP_REQUIRES( + ctx, + (round_mode_string == "HALF_UP" || round_mode_string == "HALF_TO_EVEN"), + errors::InvalidArgument("Round mode string must be " + "'HALF_UP' or " + "'HALF_TO_EVEN', is '" + + round_mode_string + "'")); + if (round_mode_string == "HALF_UP") { + round_mode_ = ROUND_HALF_UP; + } else if (round_mode_string == "HALF_TO_EVEN") { + round_mode_ = ROUND_HALF_TO_EVEN; + } } }; diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc index 415ce9b77ff..8822e29f7e7 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc index 107fa62967a..65e158d64fd 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc @@ -113,12 +113,21 @@ class MeanOp : public XlaReductionOp { xla::Add(scalar_lhs, scalar_rhs); } - xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder, - const xla::XlaOp& reduce_output, - int64 num_elements_reduced) override { - auto divisor = XlaHelpers::IntegerLiteral(builder, input_type(0), - num_elements_reduced); - return reduce_output / divisor; + xla::XlaOp BuildFinalizer( + xla::XlaBuilder* /*builder*/, const xla::XlaOp& input, + const xla::XlaOp& reduce_output, + const std::vector& dimensions_to_reduce) override { + if (dimensions_to_reduce.empty()) { + return reduce_output; + } + auto divisor = xla::GetDimensionSize(input, dimensions_to_reduce[0]); + for (int i = 1; i < dimensions_to_reduce.size(); i++) { + auto size = xla::GetDimensionSize(input, dimensions_to_reduce[i]); + divisor = xla::Mul(divisor, size); + } + divisor = xla::ConvertElementType(divisor, xla_reduction_type_); + return XlaHelpers::ConvertElementType(reduce_output / divisor, + input_type(0)); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h index 466e79828d1..af716eab798 100644 --- 
a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h @@ -48,13 +48,14 @@ class XlaReductionOp : public XlaOpKernel { const xla::XlaOp& scalar_rhs) = 0; // Applies a transformation to the output of the reduction. The desired - // computation should be added to 'builder'. Argument 'reduce_output' is the - // output of the reduction. 'num_elements_reduced' is the number of elements - // that contributed to the reduction. Returns the transformed reduction - // output, Defaults to returning 'reduce_output' unchanged. - virtual xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder, - const xla::XlaOp& reduce_output, - int64 num_elements_reduced); + // computation should be added to 'builder'. Argument 'input' is the original + // input of the reduction; 'reduce_output' is the output of the reduction. + // Returns the transformed reduction output. Defaults to returning + // 'reduce_output' converted to the input type. + virtual xla::XlaOp BuildFinalizer( + xla::XlaBuilder* builder, const xla::XlaOp& input, + const xla::XlaOp& reduce_output, + const std::vector& dimensions_to_reduce); void Compile(XlaOpKernelContext* ctx) override; diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc index 118f2798d55..2ca2a85244b 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc @@ -35,12 +35,13 @@ XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx, ctx, DataTypeToPrimitiveType(reduction_type_, &xla_reduction_type_)); } -// Unless BuildFinalizer is overridden the reduction has no -// finalizer. -xla::XlaOp XlaReductionOp::BuildFinalizer(xla::XlaBuilder* builder, - const xla::XlaOp& reduce_output, - int64 num_elements_reduced) { - return reduce_output; +// The default finalizer converts the results back into the input type. This can +// be overridden. +xla::XlaOp XlaReductionOp::BuildFinalizer( + xla::XlaBuilder* /*builder*/, const xla::XlaOp& /*input*/, + const xla::XlaOp& reduce_output, + const std::vector& /*dimensions_to_reduce*/) { + return XlaHelpers::ConvertElementType(reduce_output, input_type(0)); } void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { @@ -71,7 +72,6 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { absl::InlinedVector bitmap(data_shape.dims(), false); std::vector xla_axes; - int64 num_elements_reduced = 1LL; for (int64 i = 0; i < axes_tensor_shape.num_elements(); ++i) { int64 index = axes[i]; OP_REQUIRES(ctx, @@ -82,7 +82,6 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { index = (index + data_shape.dims()) % data_shape.dims(); bitmap[index] = true; xla_axes.push_back(index); - num_elements_reduced *= data_shape.dim_size(index); } std::vector final_shape; @@ -118,8 +117,7 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { xla::XlaComputation reduction_computation = r.Build().ConsumeValueOrDie(); auto reduce = xla::Reduce(data, initial, reduction_computation, xla_axes); - auto deconverted = XlaHelpers::ConvertElementType(b, reduce, input_type(0)); - auto finalized = BuildFinalizer(b, deconverted, num_elements_reduced); + auto finalized = BuildFinalizer(b, data, reduce, xla_axes); auto result = keep_dims_ ? 
xla::Reshape(finalized, final_shape) : finalized; ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc index 847704608fb..54d34a38abc 100644 --- a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -44,9 +43,6 @@ namespace { using xla::XlaOp; -// TODO(b/112295522): note that sampling from image boundary is not currently -// being handled properly. - // Calculates the bilinear weight tensor, given basis ratio (px, py) of the // sampling position: // W = [(1-px)*(1-py), px*(1-py), (1-px)*py, px*py] @@ -70,11 +66,8 @@ XlaOp BilinearWeights(XlaOpKernelContext* ctx, XlaOp ratio, std::vector last_two_dims_indices = {(broadcast_dims_size - 2), (broadcast_dims_size - 1)}; - xla::Shape broadcast_shape = - xla::ShapeUtil::MakeShape(xla_type, broadcast_dims); - auto broadcast_first_term = - xla::BroadcastInDim(first_term, broadcast_shape, last_two_dims_indices); + xla::BroadcastInDim(first_term, broadcast_dims, last_two_dims_indices); // Ratio is of the same dimension as warp, which is [batch, dim_0,... dim_n, // 2], we broadcast ratio tensor to 'broadcast_dim' by keeping the @@ -85,7 +78,7 @@ XlaOp BilinearWeights(XlaOpKernelContext* ctx, XlaOp ratio, ratio_broadcast_indices.erase(ratio_broadcast_indices.end() - 2); auto broadcast_ratio = - xla::BroadcastInDim(ratio, broadcast_shape, ratio_broadcast_indices); + xla::BroadcastInDim(ratio, broadcast_dims, ratio_broadcast_indices); auto first_term_subtract_weights = broadcast_first_term - broadcast_ratio; @@ -96,7 +89,7 @@ XlaOp BilinearWeights(XlaOpKernelContext* ctx, XlaOp ratio, sign_change = xla::ConvertElementType(sign_change, xla_type); auto broadcast_sign_change = - xla::BroadcastInDim(sign_change, broadcast_shape, last_two_dims_indices); + xla::BroadcastInDim(sign_change, broadcast_dims, last_two_dims_indices); auto flipped = first_term_subtract_weights * broadcast_sign_change; @@ -232,21 +225,19 @@ XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio, std::vector weights_with_channels_dims = reshaped_weights_dims; weights_with_channels_dims.push_back(data_channels); - auto weights_with_channels_shape = - xla::ShapeUtil::MakeShape(warp_type, weights_with_channels_dims); std::vector reshaped_weights_indices(reshaped_weights_dims.size()); std::iota(reshaped_weights_indices.begin(), reshaped_weights_indices.end(), 0); // The dimension is [batch, dim_0, ..., dim_n, 2, 2, data_channel]. 
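Stepping back to the reduction_ops change above: MeanOp's new BuildFinalizer divides the summed output by the product of xla::GetDimensionSize over the reduced axes instead of a precomputed element count. A NumPy sketch of that finalizer, illustrative only:

```
import numpy as np

def mean_via_sum(x, axes):
    # Reduce with a sum, then divide by the product of the sizes of the reduced
    # dimensions, taken from the input itself.
    reduced = np.sum(x, axis=tuple(axes))
    divisor = 1
    for a in axes:
        divisor *= x.shape[a]
    return reduced / divisor

x = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
print(np.allclose(mean_via_sum(x, [0, 2]), np.mean(x, axis=(0, 2))))  # True
```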
auto broadcast_reshaped_weights = xla::BroadcastInDim( - reshaped_weights, weights_with_channels_shape, reshaped_weights_indices); + reshaped_weights, weights_with_channels_dims, reshaped_weights_indices); std::vector grad_output_indices(warp_dims_without_last_dims.size()); std::iota(grad_output_indices.begin(), grad_output_indices.end(), 0); grad_output_indices.push_back(weights_with_channels_dims.size() - 1); XlaOp broadcast_grad_output = xla::BroadcastInDim( - grad_output, weights_with_channels_shape, grad_output_indices); + grad_output, weights_with_channels_dims, grad_output_indices); auto grad_output_multiply_weights = broadcast_grad_output * broadcast_reshaped_weights; @@ -294,13 +285,10 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio, std::vector warp_dims_without_last_dims(warp_dims.begin(), warp_dims.end() - 1); + // With dimension [batch, dim_0, ...dim_n, 4] std::vector neighbor_broadcast_dims = warp_dims_without_last_dims; neighbor_broadcast_dims.push_back(4); - // With dimension [batch, dim_0, ...dim_n, 4] - auto neighbor_broadcast_shape = - xla::ShapeUtil::MakeShape(data_type, neighbor_broadcast_dims); - // The dimension is [batch, dim_0, ... dim_n, 4, data_channels] auto neighbors_data = Gather2by2Neighbors( ctx->builder(), data, gather_indices, data_channels, warp_shape.dims()); @@ -326,7 +314,7 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio, xla::BroadcastInDim( xla::ConvertElementType( xla::ConstantR1(ctx->builder(), {0, 0, -1, 1}), data_type), - neighbor_broadcast_shape, {last_warp_dim}), + neighbor_broadcast_dims, {last_warp_dim}), neighbors_data, dot_dims, /*precision_config=*/nullptr); // img_cxfy - img_fxfy @@ -334,7 +322,7 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio, xla::BroadcastInDim( xla::ConvertElementType( xla::ConstantR1(ctx->builder(), {-1, 1, 0, 0}), data_type), - neighbor_broadcast_shape, {last_warp_dim}), + neighbor_broadcast_dims, {last_warp_dim}), neighbors_data, dot_dims, /*precision_config=*/nullptr); // img_cxcy - img_cxfy @@ -342,7 +330,7 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio, xla::BroadcastInDim( xla::ConvertElementType( xla::ConstantR1(ctx->builder(), {0, -1, 0, 1}), data_type), - neighbor_broadcast_shape, {last_warp_dim}), + neighbor_broadcast_dims, {last_warp_dim}), neighbors_data, dot_dims, /*precision_config=*/nullptr); // img_fxcy - img_fxfy @@ -350,7 +338,7 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio, xla::BroadcastInDim( xla::ConvertElementType( xla::ConstantR1(ctx->builder(), {-1, 0, 1, 0}), data_type), - neighbor_broadcast_shape, {last_warp_dim}), + neighbor_broadcast_dims, {last_warp_dim}), neighbors_data, dot_dims, /*precision_config=*/nullptr); // Slice out x and y. @@ -421,12 +409,13 @@ class ResamplerOp : public XlaOpKernel { OP_REQUIRES(ctx, warp_shape.dim_size(last_warp_dim) == 2, errors::InvalidArgument( "the last dimension of warp must be exactly size 2.")); + xla::PrimitiveType warp_type = ctx->input_xla_type(1); XlaOp data = ctx->Input("data"); XlaOp warp = ctx->Input("warp"); // Find the coordinates of the top left corner for the 2x2 region to be - // sampled from. The dimensions are (batch, dim_0, ... dim_n, 2) where the + // sampled from. The dimensions are [batch, dim_0, ... dim_n, 2] where the // last dimension of size 2 in turn is [x, y]. 
XlaOp top_left = xla::ConvertElementType(warp, xla::U32); @@ -457,10 +446,54 @@ class ResamplerOp : public XlaOpKernel { dot_dims.add_lhs_contracting_dimensions(warp_shape.dims() - 1); dot_dims.add_rhs_contracting_dimensions(warp_shape.dims() - 1); + // The dimension is [batch, dim_0, ...dim_n, data_channels]. auto blended_pixels = xla::DotGeneral(weights, neighbors_data, dot_dims, /*precision_config=*/nullptr); - ctx->SetOutput(0, blended_pixels); + // Handle out of boundary cases by constructing a predicate mask array based + // on the in-bound condition, and output 0 for the blended pixel value if + // out-bound. The dimension is the same as top_left: [batch, dim_0, + // ...dim_n, 2] where the last dimension of size 2 is the [x, y] coordinate. + + auto is_ge_zero = xla::Ge(warp, xla::ZerosLike(warp)); + + auto is_lt_image_size = xla::Lt( + warp, + xla::ConvertElementType( + xla::ConstantR1( + ctx->builder(), + {/*width=*/static_cast(data_shape.dim_size(2) - 1), + /*height=*/static_cast(data_shape.dim_size(1) - 1)}), + warp_type), + /*broadcast_dimensions=*/{warp_shape.dims() - 1}); + + auto is_in_bound_x_y = xla::And(is_ge_zero, is_lt_image_size); + // Reduce along last dimension. The resulting dimension is: + // [batch, dim_0, ...dim_n]. + auto is_in_bound = xla::Reduce( + is_in_bound_x_y, xla::ConstantR0(ctx->builder(), true), + xla::CreateScalarAndComputation(xla::PrimitiveType::PRED, + ctx->builder()), + {last_warp_dim}); + + // Broadcast 'is_in_bound' to the same dimension as 'blended_pixels', which + // is the dimension of the result: + // [batch, dim_0, ...dim_n, data_channels]. + auto warp_dims = warp_shape.dim_sizes(); + std::vector result_dims(warp_dims.begin(), warp_dims.end() - 1); + result_dims.push_back(data_channels); + + std::vector broadcasted_dims(warp_dims.size() - 1); + std::iota(broadcasted_dims.begin(), broadcasted_dims.end(), 0); + auto broadcasted_is_in_bound = + xla::BroadcastInDim(is_in_bound, result_dims, broadcasted_dims); + + // Set out of bound samples to zero. + auto zeros = + xla::Broadcast(xla::Zero(ctx->builder(), data_type), result_dims); + auto result = xla::Select(broadcasted_is_in_bound, blended_pixels, zeros); + + ctx->SetOutput(0, result); } }; @@ -473,6 +506,8 @@ class ResamplerGradOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &output_dtype)); } + // TODO(b/112295522): note that sampling from image boundary is not currently + // being handled properly. void Compile(XlaOpKernelContext* ctx) override { TensorShape data_shape_tf = ctx->InputShape("data"); OP_REQUIRES(ctx, data_shape_tf.dims() == 4, diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc index 6970dd0a006..e4046c79557 100644 --- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc @@ -47,8 +47,7 @@ class RetvalOp : public XlaOpKernel { // compilation. OP_REQUIRES_OK(ctx, frame->SetRetval(index_, input)); } else { - XlaContext& xla_context = XlaContext::Get(ctx); - xla_context.SetRetval(index_, ctx->InputExpression(0)); + ctx->xla_context()->SetRetval(index_, ctx->InputExpression(0)); } } diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc index 7ff3e916381..d7b38e86cc9 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc @@ -18,7 +18,6 @@ limitations under the License. 
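The ResamplerOp change above blends the 2x2 neighborhood with bilinear weights and now zeroes out samples whose warp coordinates fall outside [0, size - 1). A NumPy sketch of that forward pass for a single [H, W, C] image (the kernel keeps a batch dimension and builds the mask with xla::Select):

```
import numpy as np

def resample_2d(data, warp):
    # data: [H, W, C]; warp: [N, 2] fractional [x, y] positions.
    h, w, c = data.shape
    out = np.zeros((warp.shape[0], c), dtype=data.dtype)
    for i, (x, y) in enumerate(warp):
        if not (0 <= x < w - 1 and 0 <= y < h - 1):
            continue                       # out of bound -> stays zero
        x0, y0 = int(np.floor(x)), int(np.floor(y))
        px, py = x - x0, y - y0
        # W = [(1-px)*(1-py), px*(1-py), (1-px)*py, px*py]
        weights = np.array([(1 - px) * (1 - py), px * (1 - py),
                            (1 - px) * py,       px * py])
        neighbors = np.stack([data[y0, x0], data[y0, x0 + 1],
                              data[y0 + 1, x0], data[y0 + 1, x0 + 1]])
        out[i] = weights @ neighbors
    return out

data = np.arange(12, dtype=float).reshape(3, 4, 1)
warp = np.array([[0.5, 0.5], [10.0, 0.0]])
print(resample_2d(data, warp))  # [[2.5], [0.]]
```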
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc index b5fd7850bfc..4b9e1a578be 100644 --- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc @@ -39,8 +39,8 @@ namespace { // TODO(phawkins): implement double-sized windowed reductions in XLA and remove // the type constraint. -constexpr std::array kScanOpTypes = { - {DT_HALF, DT_BFLOAT16, DT_FLOAT}}; +constexpr std::array kScanOpTypes = { + {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_INT32}}; class ScanOp : public XlaOpKernel { public: @@ -103,11 +103,10 @@ class ScanOp : public XlaOpKernel { reducer = ctx->GetOrCreateMul(dtype); } auto output = xla::ReduceWindowWithGeneralPadding( - XlaHelpers::ConvertElementType(builder, ctx->Input(0), dtype), init, - *reducer, window_dims, window_strides, + XlaHelpers::ConvertElementType(ctx->Input(0), dtype), init, *reducer, + window_dims, window_strides, /*base_dilations=*/{}, /*window_dilations=*/{}, padding); - output = - XlaHelpers::ConvertElementType(builder, output, ctx->input_type(0)); + output = XlaHelpers::ConvertElementType(output, ctx->input_type(0)); // In exclusive mode, we have computed an extra element containing the sum // of all the input elements. Slice off this extra "last" element. diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc index a7f5a8f1698..84470b230d4 100644 --- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc @@ -42,7 +42,7 @@ SendOp::SendOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { } void SendOp::Compile(XlaOpKernelContext* ctx) { - XlaCompiler* compiler = XlaContext::Get(ctx).compiler(); + XlaCompiler* compiler = ctx->compiler(); xla::ChannelHandle channel; OP_REQUIRES_OK(ctx, compiler->GetChannelHandle(tensor_name_, &channel)); xla::Send(ctx->Input(0), channel); @@ -73,7 +73,7 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { } void RecvOp::Compile(XlaOpKernelContext* ctx) { - XlaCompiler* compiler = XlaContext::Get(ctx).compiler(); + XlaCompiler* compiler = ctx->compiler(); xla::ChannelHandle channel; OP_REQUIRES_OK(ctx, compiler->GetChannelHandle(tensor_name_, &channel)); ctx->SetOutput(0, xla::Recv(ctx->builder(), shape_, channel)); diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc index 60b011ba6d9..b1fa2915d59 100644 --- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc index d6bd927135c..20da8033536 100644 --- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc @@ -71,7 +71,7 @@ class SoftmaxOp : public XlaOpKernel { auto reduce = xla::Reduce(converted, xla::Zero(b, xla_accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), {kClassDim}); - auto sum = XlaHelpers::ConvertElementType(b, reduce, type); + auto sum = XlaHelpers::ConvertElementType(reduce, type); auto softmax = log_ // softmax = shifted_logits - log(sum(exp(shifted_logits))) @@ -111,11 +111,11 @@ std::pair CrossEntropyWithLogits( // sum_{class} (exp(logits - max_logits)) const DataType accumulation_type = XlaHelpers::SumAccumulationType(type); auto converted = - XlaHelpers::ConvertElementType(b, exp_shifted_logits, accumulation_type); + XlaHelpers::ConvertElementType(exp_shifted_logits, accumulation_type); auto reduce = xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), {kClassDim}); - auto sum_exp = XlaHelpers::ConvertElementType(b, reduce, type); + auto sum_exp = XlaHelpers::ConvertElementType(reduce, type); // log(sum(exp(logits - max_logits))) auto log_sum_exp = xla::Log(sum_exp); @@ -126,11 +126,10 @@ std::pair CrossEntropyWithLogits( // (The subtraction broadcasts along the batch dimension.) auto sub = xla::Sub(shifted_logits, log_sum_exp, {kBatchDim}); auto mul = xla::Mul(xla::Neg(labels), sub); - auto sum = - xla::Reduce(XlaHelpers::ConvertElementType(b, mul, accumulation_type), - XlaHelpers::Zero(b, accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), {kClassDim}); - auto loss = XlaHelpers::ConvertElementType(b, sum, type); + auto sum = xla::Reduce(XlaHelpers::ConvertElementType(mul, accumulation_type), + XlaHelpers::Zero(b, accumulation_type), + *ctx->GetOrCreateAdd(accumulation_type), {kClassDim}); + auto loss = XlaHelpers::ConvertElementType(sum, type); // backprop: prob - labels, where // prob = exp(logits - max_logits) / sum(exp(logits - max_logits)) diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc index 7b96b43ad83..8e9e4daf99d 100644 --- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc @@ -69,7 +69,7 @@ Status MaybeInitializeStack(xla::XlaBuilder* builder, XlaResource* resource, } TensorShape stack_shape; - stack_shape.AddDim(resource->tensor_array_size()); + stack_shape.AddDim(resource->max_array_size()); stack_shape.AppendShape(elem_shape); if (!resource->initialized()) { @@ -97,10 +97,10 @@ class StackOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - int64 size; - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &size)); + int64 max_size; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &max_size)); OP_REQUIRES( - ctx, size >= 0, + ctx, max_size >= 0, errors::InvalidArgument( "XLA compilation requires a fixed stack size upper bound. 
If " "you are using tf.while_loop, set the maximum_iterations parameter " @@ -108,14 +108,9 @@ class StackOp : public XlaOpKernel { // We defer initializing the Stack resource until we see the first push. // Otherwise we do not know the shape of the stack elements. - xla::XlaOp value; - XlaContext& xc = XlaContext::Get(ctx); - XlaResource* resource; - string name = absl::StrCat("Stack: ", stack_name_); - OP_REQUIRES_OK( - ctx, xc.CreateResource(XlaResource::kStack, -1, std::move(name), dtype_, - TensorShape(), value, /*tensor_array_size=*/size, - /*tensor_array_gradients=*/{}, &resource)); + XlaResource* resource = + ctx->xla_context()->AddResource(XlaResource::CreateStack( + /*name=*/absl::StrCat("Stack: ", stack_name_), dtype_, max_size)); ctx->SetResourceOutput(0, resource); } diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc index 5db52781be4..50653d7b397 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc @@ -23,7 +23,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/math.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/lib/prng.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index 252967a7464..939d7e19515 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -61,8 +61,8 @@ Status MaybeInitializeTensorArray(xla::XlaBuilder* builder, " but op has dtype ", DataTypeString(dtype), "."); } - TF_RET_CHECK(resource->tensor_array_size() >= 0) - << resource->name() << " size " << resource->tensor_array_size(); + TF_RET_CHECK(resource->max_array_size() >= 0) + << resource->name() << " size " << resource->max_array_size(); if (!resource->initialized()) { TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape)); @@ -78,7 +78,7 @@ Status MaybeInitializeTensorArray(xla::XlaBuilder* builder, XLAShapeToTensorShape(shape_or_status.ValueOrDie(), &shape)); TensorShape ta_shape; - ta_shape.AddDim(resource->tensor_array_size()); + ta_shape.AddDim(resource->max_array_size()); ta_shape.AppendShape(elem_shape); if (ta_shape != shape) { return errors::InvalidArgument( @@ -114,7 +114,7 @@ Status CheckTensorArrayIsInitialized(const string& op_name, Status GetTensorArrayShape(const XlaResource* resource, xla::XlaBuilder* builder, TensorShape* shape) { *shape = resource->shape(); - shape->InsertDim(0, resource->tensor_array_size()); + shape->InsertDim(0, resource->max_array_size()); return Status::OK(); } @@ -166,13 +166,10 @@ class TensorArrayOp : public XlaOpKernel { value = xla::Broadcast(zero, ta_shape.dim_sizes()); } - XlaContext& xc = XlaContext::Get(ctx); - XlaResource* var; - string name = absl::StrCat("TensorArray: ", tensor_array_name_); - OP_REQUIRES_OK( - ctx, xc.CreateResource(XlaResource::kTensorArray, -1, std::move(name), - dtype_, shape, value, /*tensor_array_size=*/size, - /*tensor_array_gradients=*/{}, &var)); + XlaResource* var = + ctx->xla_context()->AddResource(XlaResource::CreateTensorArray( + /*name=*/absl::StrCat("TensorArray: ", tensor_array_name_), dtype_, + 
shape, /*initial_value=*/value, /*max_array_size=*/size)); ctx->SetResourceOutput(0, var); Tensor flow(DT_FLOAT, TensorShape({})); @@ -517,14 +514,13 @@ class TensorArraySplitOp : public XlaOpKernel { xla::XlaOp ta = resource->value(); TensorShape ta_shape; - ta_shape.AddDim(resource->tensor_array_size()); + ta_shape.AddDim(resource->max_array_size()); ta_shape.AppendShape(elem_shape); - OP_REQUIRES( - ctx, lengths.size() == resource->tensor_array_size(), - errors::InvalidArgument( - "TensorArray's size is not equal to the size of lengths (", - lengths.size(), " vs. ", resource->tensor_array_size(), ")")); + OP_REQUIRES(ctx, lengths.size() == resource->max_array_size(), + errors::InvalidArgument( + "TensorArray's size is not equal to the size of lengths (", + lengths.size(), " vs. ", resource->max_array_size(), ")")); const xla::XlaOp value = ctx->Input(1); const xla::XlaOp flow = ctx->Input(3); @@ -562,8 +558,7 @@ class TensorArraySizeOp : public XlaOpKernel { XlaResource* var; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &var)); Tensor size_tensor(DT_INT32, {}); - size_tensor.scalar()() = - static_cast(var->tensor_array_size()); + size_tensor.scalar()() = static_cast(var->max_array_size()); ctx->SetConstantOutput(0, size_tensor); } diff --git a/tensorflow/compiler/tf2xla/kernels/topk_op.cc b/tensorflow/compiler/tf2xla/kernels/topk_op.cc index 8a0c94cfae1..ee3bdf3394e 100644 --- a/tensorflow/compiler/tf2xla/kernels/topk_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/topk_op.cc @@ -15,7 +15,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/lib/sorting.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc index 7077c2e3a54..960c1462ceb 100644 --- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc @@ -320,9 +320,8 @@ class ResourceApplyAdagradDA : public XlaOpKernel { xla::XlaOp lr = ctx->Input(4); xla::XlaOp l1 = ctx->Input(5); xla::XlaOp l2 = ctx->Input(6); - xla::XlaBuilder* const b = ctx->builder(); xla::XlaOp global_step = - XlaHelpers::ConvertElementType(b, ctx->Input(7), dtype_); + XlaHelpers::ConvertElementType(ctx->Input(7), dtype_); accum = accum + grad; squared_accum = squared_accum + xla::Square(grad); diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 559414eeaa5..ce007fc04a8 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -64,7 +64,7 @@ Status MakeXlaCompilerArgumentsFromInputs( if (!arg.initialized) { *has_uninitialized_vars = true; } - arg.tensor_array_size = resource->tensor_array_size(); + arg.max_array_size = resource->max_array_size(); for (const auto& gradient : resource->tensor_array_gradients()) { arg.tensor_array_gradients.insert(gradient.first); } diff --git a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc index a9f88a6df25..ad8e707e111 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc @@ -89,13 +89,10 @@ class XlaBroadcastHelperOp : public XlaOpKernel { 
lhs_shape.DebugString(), " and ", rhs_shape.DebugString())); broadcast_shape[dim] = min_rank_shape->dim_size(i); } - xla::PrimitiveType type = context->input_xla_type(0); - xla::Shape broadcast_xla_shape = - xla::ShapeUtil::MakeShape(type, broadcast_shape); if (broadcast_lhs) { - lhs = xla::BroadcastInDim(lhs, broadcast_xla_shape, broadcast_dims); + lhs = xla::BroadcastInDim(lhs, broadcast_shape, broadcast_dims); } else { - rhs = xla::BroadcastInDim(rhs, broadcast_xla_shape, broadcast_dims); + rhs = xla::BroadcastInDim(rhs, broadcast_shape, broadcast_dims); } context->SetOutput(0, lhs); context->SetOutput(1, rhs); diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD index 1ce3930fd1c..422781d536a 100644 --- a/tensorflow/compiler/tf2xla/lib/BUILD +++ b/tensorflow/compiler/tf2xla/lib/BUILD @@ -17,20 +17,6 @@ filegroup( load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test") -cc_library( - name = "batch_dot", - srcs = ["batch_dot.cc"], - hdrs = ["batch_dot.h"], - deps = [ - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:xla_builder", - "//tensorflow/core:lib", - ], -) - cc_library( name = "broadcast", srcs = ["broadcast.cc"], @@ -52,7 +38,6 @@ cc_library( srcs = ["cholesky.cc"], hdrs = ["cholesky.h"], deps = [ - ":batch_dot", ":triangular_solve", ":util", ":while_loop", @@ -63,6 +48,8 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/client/lib:matrix", + "//tensorflow/compiler/xla/client/lib:slicing", "//tensorflow/core:lib", ], ) @@ -87,7 +74,6 @@ cc_library( srcs = ["qr.cc"], hdrs = ["qr.h"], deps = [ - ":batch_dot", ":util", ":while_loop", "//tensorflow/compiler/xla:literal_util", @@ -99,7 +85,8 @@ cc_library( "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/client/lib:constants", "//tensorflow/compiler/xla/client/lib:math", - "//tensorflow/compiler/xla/client/lib:numeric", + "//tensorflow/compiler/xla/client/lib:matrix", + "//tensorflow/compiler/xla/client/lib:slicing", "//tensorflow/core:lib", ], ) @@ -129,7 +116,6 @@ cc_library( srcs = ["triangular_solve.cc"], hdrs = ["triangular_solve.h"], deps = [ - ":batch_dot", ":util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -140,7 +126,9 @@ cc_library( "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:constants", - "//tensorflow/compiler/xla/client/lib:numeric", + "//tensorflow/compiler/xla/client/lib:math", + "//tensorflow/compiler/xla/client/lib:matrix", + "//tensorflow/compiler/xla/client/lib:slicing", "//tensorflow/core:lib", ], ) @@ -187,29 +175,6 @@ cc_library( ], ) -xla_test( - name = "util_test", - srcs = ["util_test.cc"], - deps = [ - ":batch_dot", - ":util", - "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:global_data", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/tests:client_library_test_base", - "//tensorflow/compiler/xla/tests:literal_test_util", 
- "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:lib", - "//tensorflow/core:test", - ], -) - cc_library( name = "while_loop", srcs = ["while_loop.cc"], diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc deleted file mode 100644 index 5400e8834cb..00000000000 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" - -#include -#include - -#include "tensorflow/compiler/xla/client/xla_builder.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/lib/core/errors.h" - -namespace tensorflow { - -xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x, - bool transpose_y, bool conjugate_x, bool conjugate_y, - xla::PrecisionConfig::Precision precision) { - xla::XlaBuilder* builder = x.builder(); - return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { - TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x)); - TF_ASSIGN_OR_RETURN(xla::Shape y_shape, builder->GetShape(y)); - - // Check that both tensors have the same number of dimensions. There must be - // at least two (the batch dimensions can be empty). - if (xla::ShapeUtil::Rank(x_shape) != xla::ShapeUtil::Rank(y_shape)) { - return errors::InvalidArgument( - "Arguments to BatchedDot have different ranks: ", - xla::ShapeUtil::HumanString(x_shape), " vs. ", - xla::ShapeUtil::HumanString(y_shape)); - } - const int ndims = xla::ShapeUtil::Rank(x_shape); - if (ndims < 2) { - return errors::InvalidArgument( - "Arguments to BatchedDot must have rank >= 2: ", ndims); - } - - // The batch dimensions must be equal and the matrix dimensions must be - // valid. - std::vector batch_dimension_numbers; - for (int i = 0; i < ndims - 2; ++i) { - if (x_shape.dimensions(i) != y_shape.dimensions(i)) { - return errors::InvalidArgument( - "Dimension ", i, " of inputs to BatchedDot must be equal: ", - xla::ShapeUtil::HumanString(x_shape), " vs ", - xla::ShapeUtil::HumanString(y_shape)); - } - batch_dimension_numbers.push_back(i); - } - - int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1); - int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2); - if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) { - return errors::InvalidArgument( - "Dimensions ", x_inner_dim, " and ", y_inner_dim, - " of arguments to BatchedDot must be equal: ", - xla::ShapeUtil::HumanString(x_shape), " transpose: ", transpose_x, - " vs. ", xla::ShapeUtil::HumanString(y_shape), - " transpose: ", transpose_y); - } - - // Check for zero lhs/rhs dim size. 
- if (xla::ShapeUtil::IsZeroElementArray(x_shape) || - xla::ShapeUtil::IsZeroElementArray(y_shape)) { - std::vector dimensions(batch_dimension_numbers.size()); - for (int i = 0; i < batch_dimension_numbers.size(); ++i) { - dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]); - } - int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2); - int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1); - dimensions.push_back(x_shape.dimensions(x_outer_dim)); - dimensions.push_back(y_shape.dimensions(y_outer_dim)); - return xla::Broadcast( - xla::ConstantLiteral(builder, - xla::LiteralUtil::Zero(x_shape.element_type())), - dimensions); - } - - if (x_shape.element_type() == xla::C64 && conjugate_x) { - x = xla::Conj(x); - } - if (y_shape.element_type() == xla::C64 && conjugate_y) { - y = xla::Conj(y); - } - - xla::PrecisionConfig precision_proto; - precision_proto.add_operand_precision(precision); - precision_proto.add_operand_precision(precision); - - xla::DotDimensionNumbers dot_dnums; - dot_dnums.add_lhs_contracting_dimensions(x_inner_dim); - dot_dnums.add_rhs_contracting_dimensions(y_inner_dim); - for (auto batch_dimension_number : batch_dimension_numbers) { - dot_dnums.add_lhs_batch_dimensions(batch_dimension_number); - dot_dnums.add_rhs_batch_dimensions(batch_dimension_number); - } - - return xla::DotGeneral(x, y, dot_dnums, &precision_proto); - }); -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h deleted file mode 100644 index 6edd63a4d3b..00000000000 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_ -#define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_ - -#include "tensorflow/compiler/xla/client/xla_builder.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" - -namespace tensorflow { - -// Multiplies slices of two tensors in batches. - -// Multiplies all slices of `Tensor` `x` and `y` (each slice can be -// viewed as an element of a batch), and arranges the individual results -// in a single output tensor of the same batch size. Each of the -// individual slices can optionally be transposed before multiplication by -// setting the `transpose_x` or `transpose_y` flag to `true`. Similarly, each -// can be elementwise-complex-conjugated by setting the `conjugate_x` or -// `conjugate_y` flag to `true`. To apply a Hermitian adjoint to `x`, set both -// `transpose_x` and `conjugate_x` to `true`, and analogously for `y`. -// -// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` -// and `[..., r_y, c_y]`. 
-// -// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where: -// -// r_o = c_x if transpose_x else r_x -// c_o = r_y if transpose_y else c_y -// -// It is computed as: -// -// output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) -xla::XlaOp BatchDot( - xla::XlaOp x, xla::XlaOp y, bool transpose_x = false, - bool transpose_y = false, bool conjugate_x = false, - bool conjugate_y = false, - xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT); - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_ diff --git a/tensorflow/compiler/tf2xla/lib/broadcast.cc b/tensorflow/compiler/tf2xla/lib/broadcast.cc index 3e402ef855c..be31f116686 100644 --- a/tensorflow/compiler/tf2xla/lib/broadcast.cc +++ b/tensorflow/compiler/tf2xla/lib/broadcast.cc @@ -80,10 +80,8 @@ xla::StatusOr BroadcastTo(xla::XlaOp input, broadcast_dim = broadcast_shape_size - broadcast_dim - 1; } absl::c_reverse(broadcast_shape); - xla::XlaOp output = xla::BroadcastInDim( - input, - xla::ShapeUtil::MakeShape(input_shape.element_type(), broadcast_shape), - broadcast_dims); + xla::XlaOp output = + xla::BroadcastInDim(input, broadcast_shape, broadcast_dims); if (broadcast_shape != output_dims) { output = xla::Reshape(output, output_dims); } diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc index ab3d0a56683..7ef8659992f 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.cc +++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc @@ -18,11 +18,12 @@ limitations under the License. #include #include -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" #include "tensorflow/compiler/tf2xla/lib/triangular_solve.h" #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/lib/while_loop.h" #include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" +#include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -101,10 +102,7 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a, // a[..., i, i] auto a_ii = DynamicSliceInMinorDims(body_a, {i, i}, {1, 1}); // np.dot(row, np.swapaxes(row, -1, -2)) - auto diag_dot = BatchDot(row, row, - /*transpose_x=*/false, - /*transpose_y=*/true, /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + auto diag_dot = BatchDot(row, TransposeInMinorDims(row), precision); // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row, // np.swapaxes(row, -1, -2))) auto l_ii = @@ -122,10 +120,7 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a, // The columns in [i, n] are zeroed out in `row`, so we just have to // zero out rows above i+1 after the BatchDot. 
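// A rough sketch (not part of the patch): the tf2xla BatchDot deleted above
// took transpose/conjugate flags; the call sites in cholesky, qr and
// triangular_solve switch to the client-library BatchDot from
// xla/client/lib/matrix.h and spell the transpose out via
// TransposeInMinorDims. Assuming those helpers (BatchDot,
// TransposeInMinorDims, MaybeConjugate) live in namespace xla with the
// signatures the new call sites imply, the old flag-based behaviour maps to:
xla::XlaOp BatchDotCompat(xla::XlaOp x, xla::XlaOp y, bool transpose_x,
                          bool transpose_y, bool conjugate_x, bool conjugate_y,
                          xla::PrecisionConfig::Precision precision) {
  x = xla::MaybeConjugate(x, conjugate_x);  // conjugates only complex inputs
  y = xla::MaybeConjugate(y, conjugate_y);
  if (transpose_x) x = xla::TransposeInMinorDims(x);
  if (transpose_y) y = xla::TransposeInMinorDims(y);
  return xla::BatchDot(x, y, precision);
}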
np.dot(l[..., :, :i], // r.T) - auto dot = BatchDot(body_l, row, - /*transpose_x=*/false, - /*transpose_y=*/true, /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + auto dot = BatchDot(body_l, TransposeInMinorDims(row), precision); // np.dot(l[..., i+1:, :i], r.T) auto dot_ip1 = xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, dot); @@ -185,9 +180,7 @@ xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size, // a[i:, i:i+k] -= np.dot(l[i:, :i], np.transpose(l[i:i+k, :i])) auto lhs = SliceInMinorDims(l, {i, 0}, {n, i}); auto rhs = SliceInMinorDims(l, {i, 0}, {i + k, i}); - auto delta = BatchDot(lhs, rhs, /*transpose_x=*/false, - /*transpose_y=*/true, /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + auto delta = BatchDot(lhs, TransposeInMinorDims(rhs), precision); auto before = SliceInMinorDims(a, {i, i}, {n, i + k}); a = UpdateSliceInMinorDims(a, before - delta, {i, i}); } diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/tf2xla/lib/qr.cc index 6b3f2b6e065..d6007748609 100644 --- a/tensorflow/compiler/tf2xla/lib/qr.cc +++ b/tensorflow/compiler/tf2xla/lib/qr.cc @@ -18,13 +18,13 @@ limitations under the License. #include #include -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/lib/while_loop.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/math.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" +#include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -191,12 +191,8 @@ xla::StatusOr QRBlock( auto v_broadcast = xla::Reshape(v, shape); // a[:, :] -= tau * np.dot(v[:, np.newaxis], // np.dot(v[np.newaxis, :], a[:, :])) - auto vva = - BatchDot(v_broadcast, a, /*transpose_x=*/false, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); - vva = - BatchDot(v_broadcast, vva, /*transpose_x=*/true, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + auto vva = BatchDot(v_broadcast, a, precision); + vva = BatchDot(TransposeInMinorDims(v_broadcast), vva, precision); a = a - xla::Mul(tau, vva, /*broadcast_dimensions=*/batch_dim_indices); @@ -278,12 +274,9 @@ xla::StatusOr ComputeWYRepresentation( auto beta = DynamicSliceInMinorDims(taus, {j}, {1}); // yv has shape [..., n, 1] - auto yv = BatchDot(y, v, /*transpose_x=*/true, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + auto yv = BatchDot(TransposeInMinorDims(y), v, precision); // wyv has shape [..., m, 1] - auto wyv = - BatchDot(w, yv, /*transpose_x=*/false, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + auto wyv = BatchDot(w, yv, precision); auto z = xla::Mul( -beta, v + wyv, @@ -375,23 +368,15 @@ xla::StatusOr QRDecomposition( // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:])) auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n}); - auto a_update = - BatchDot(w, a_panel, /*transpose_x=*/true, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); - a_update = - BatchDot(y, a_update, /*transpose_x=*/false, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + auto a_update = 
BatchDot(TransposeInMinorDims(w), a_panel, precision); + a_update = BatchDot(y, a_update, precision); a_panel = a_panel + a_update; a = UpdateSliceInMinorDims(a, a_panel, {i, i + k}); // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T)) auto q_panel = SliceInMinorDims(q, {0, i}, {m, m}); - auto q_update = - BatchDot(q_panel, w, /*transpose_x=*/false, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); - q_update = BatchDot(q_update, y, /*transpose_x=*/false, - /*transpose_y=*/true, /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + auto q_update = BatchDot(q_panel, w, precision); + q_update = BatchDot(q_update, TransposeInMinorDims(y), precision); q_panel = q_panel + q_update; q = UpdateSliceInMinorDims(q, q_panel, {0, i}); } diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc index 6524c2a9b1a..192a61dca26 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc @@ -18,10 +18,11 @@ limitations under the License. #include #include -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" +#include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/literal.h" @@ -311,13 +312,13 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks( auto a_row = MaybeConjugate(SliceInMinorDims(a, start, end), conjugate_a); if (left_side) { - remainder = b_row - BatchDot(a_row, x, transpose_a, false, - /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + remainder = + b_row - BatchDot(MaybeTransposeInMinorDims(a_row, transpose_a), x, + precision); } else { - remainder = b_row - BatchDot(x, a_row, false, transpose_a, - /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + remainder = + b_row - BatchDot(x, MaybeTransposeInMinorDims(a_row, transpose_a), + precision); } } @@ -327,13 +328,12 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks( xla::ConstantR0WithType(builder, xla::S32, j * block_size); std::vector update_starts = {start_index, zero}; if (left_side) { - x_update = - BatchDot(inv_block, remainder, transpose_a, false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + x_update = BatchDot(MaybeTransposeInMinorDims(inv_block, transpose_a), + remainder, precision); } else { - x_update = - BatchDot(remainder, inv_block, false, transpose_a, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + x_update = BatchDot(remainder, + MaybeTransposeInMinorDims(inv_block, transpose_a), + precision); std::swap(update_starts[0], update_starts[1]); } x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts); diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc index 804671fbc75..c0bd172d17c 100644 --- a/tensorflow/compiler/tf2xla/lib/util.cc +++ b/tensorflow/compiler/tf2xla/lib/util.cc @@ -113,36 +113,6 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, return xla::ConstantLiteral(builder, literal); } -xla::XlaOp SliceInMinorDims(xla::XlaOp x, absl::Span start, - absl::Span end) { - xla::XlaBuilder* builder = x.builder(); - return 
builder->ReportErrorOrReturn([&]() -> xla::StatusOr { - TF_RET_CHECK(start.size() == end.size()); - int64 n_minor_dims = start.size(); - - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - - const int64 n_dims = xla::ShapeUtil::Rank(shape); - TF_RET_CHECK(n_minor_dims <= n_dims); - auto major_dims = xla::AsInt64Slice(shape.dimensions()) - .subspan( - /*pos=*/0, - /*len=*/n_dims - n_minor_dims); - - // Prepends 0s in the major dim - std::vector padded_start(n_dims, 0); - std::copy(start.begin(), start.end(), - padded_start.begin() + major_dims.size()); - - // Prepends the shape of the major dims. - std::vector padded_end(n_dims); - std::copy(major_dims.begin(), major_dims.end(), padded_end.begin()); - std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size()); - - std::vector strides(n_dims, 1); - return xla::Slice(x, padded_start, padded_end, strides); - }); -} std::vector ConcatVectors(absl::Span xs, absl::Span ys) { @@ -152,100 +122,4 @@ std::vector ConcatVectors(absl::Span xs, return output; } -xla::XlaOp DynamicSliceInMinorDims(xla::XlaOp x, - absl::Span starts, - absl::Span sizes) { - xla::XlaBuilder* builder = x.builder(); - return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(shape); - int64 n_minor_dims = starts.size(); - TF_RET_CHECK(n_minor_dims == sizes.size()); - TF_RET_CHECK(n_minor_dims <= n_dims); - auto major_dims = xla::AsInt64Slice(shape.dimensions()) - .subspan( - /*pos=*/0, - /*len=*/n_dims - sizes.size()); - auto padded_starts = PrependZerosInMajorDims(x, starts); - auto padded_sizes = ConcatVectors(major_dims, sizes); - return xla::DynamicSlice(x, padded_starts, padded_sizes); - }); -} - -xla::XlaOp UpdateSlice(xla::XlaOp x, xla::XlaOp update, - absl::Span start) { - xla::XlaBuilder* builder = x.builder(); - return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { - // TODO(phawkins): make int64 work on all backends, remove the int32 cast. 
- std::vector start_as_int32(start.begin(), start.end()); - auto start_constant = xla::ConstantR1(builder, start_as_int32); - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(shape); - TF_ASSIGN_OR_RETURN(xla::Shape start_constant_shape, - builder->GetShape(start_constant)); - const int64 start_length = - xla::ShapeUtil::GetDimension(start_constant_shape, -1); - TF_RET_CHECK(start_length == n_dims); - return xla::DynamicUpdateSlice(x, update, start_constant); - }); -} - -xla::XlaOp UpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update, - absl::Span start) { - xla::XlaBuilder* builder = x.builder(); - return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(shape); - const int64 n_minor_dims = start.size(); - TF_RET_CHECK(n_minor_dims <= n_dims); - std::vector padded_start(n_dims, 0); - std::copy(start.begin(), start.end(), - padded_start.begin() + (n_dims - n_minor_dims)); - return UpdateSlice(x, update, padded_start); - }); -} - -xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update, - absl::Span starts) { - auto padded_starts = PrependZerosInMajorDims(x, starts); - return xla::DynamicUpdateSlice(x, update, padded_starts); -} - -xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x, - absl::Span starts) { - xla::XlaBuilder* builder = x.builder(); - return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(shape); - auto zero = xla::Reshape(xla::ConstantR0(builder, 0), {1}); - std::vector padded_starts(n_dims, zero); - for (int i = 0; i < starts.size(); ++i) { - padded_starts[n_dims - starts.size() + i] = xla::Reshape(starts[i], {1}); - } - return xla::ConcatInDim(builder, padded_starts, 0); - }); -} - -xla::XlaOp TransposeInMinorDims(xla::XlaOp x) { - xla::XlaBuilder* builder = x.builder(); - return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(shape); - TF_RET_CHECK(n_dims >= 2); - std::vector permutation(n_dims); - std::iota(permutation.begin(), permutation.end(), 0); - std::swap(permutation[n_dims - 1], permutation[n_dims - 2]); - return xla::Transpose(x, permutation); - }); -} - -xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate) { - xla::XlaBuilder* builder = x.builder(); - return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - auto perform_conj = shape.element_type() == xla::C64 && conjugate; - return perform_conj ? xla::Conj(x) : x; - }); -} - } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h index 80e9e5b002d..aec8061cb43 100644 --- a/tensorflow/compiler/tf2xla/lib/util.h +++ b/tensorflow/compiler/tf2xla/lib/util.h @@ -38,44 +38,10 @@ xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x, xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, int64 value); -// Builds a vector of zeros of length rank(x) with the last values being -// those in `starts`. -xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x, - absl::Span starts); - -// Performs a slice in the minor dimensions of a Tensor. -xla::XlaOp SliceInMinorDims(xla::XlaOp x, absl::Span start, - absl::Span end); - // Returns the concatenation of `xs` and `ys`. 
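// A rough sketch (not part of the patch): the slice/update helpers removed
// from tf2xla/lib/util.cc above have counterparts in the XLA client library
// (xla/client/lib/slicing.h), which the BUILD changes earlier in this patch
// add as a dependency. Assuming the counterparts keep the same names, a
// caller migrates like this (ZeroLowerRightBlock is a made-up example):
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/lib/slicing.h"

xla::XlaOp ZeroLowerRightBlock(xla::XlaOp x, int64 n) {
  // Read x[..., n:2n, n:2n] of a batched matrix, then write zeros back into
  // that block, addressing only the two minor (matrix) dimensions.
  xla::XlaOp block =
      xla::SliceInMinorDims(x, /*start=*/{n, n}, /*end=*/{2 * n, 2 * n});
  xla::XlaOp zeros = xla::ZerosLike(block);
  return xla::UpdateSliceInMinorDims(x, zeros, /*start=*/{n, n});
}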
std::vector ConcatVectors(absl::Span xs, absl::Span ys); -// Performs a dynamic slice in the minor dimensions of a Tensor. -xla::XlaOp DynamicSliceInMinorDims(xla::XlaOp x, - absl::Span starts, - absl::Span sizes); - -// Updates a slice of 'x', i.e., -// x[start[0], ..., start[n]] = update -xla::XlaOp UpdateSlice(xla::XlaOp x, xla::XlaOp update, - absl::Span start); - -// Updates a slice of 'x', where 'start' contains a list of minor dimensions: -// x[..., start[0], ..., start[n]] = update -xla::XlaOp UpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update, - absl::Span start); - -xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update, - absl::Span starts); - -// Transposes a stack of matrices `x` by swapping the last two dimensions. -xla::XlaOp TransposeInMinorDims(xla::XlaOp x); - -// Applies a complex conjugation operation if `a` is complex and `conjugate_a` -// is true, otherwise returns its argument. -xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate); - } // namespace tensorflow #endif // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_ diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD index c9f486edc8d..fef97b98c37 100644 --- a/tensorflow/compiler/tf2xla/python/BUILD +++ b/tensorflow/compiler/tf2xla/python/BUILD @@ -1,11 +1,13 @@ licenses(["notice"]) # Apache 2.0 +package_group( + name = "friends", + includes = ["//tensorflow:internal"], +) + package( default_visibility = [ - "//learning/deepmind/public/wavenet/python:__subpackages__", - "//learning/deepmind/research/alphastar:__subpackages__", - "//learning/tfx:__subpackages__", - "//tensorflow:internal", + ":friends", ], ) diff --git a/tensorflow/compiler/tf2xla/shape_util.h b/tensorflow/compiler/tf2xla/shape_util.h index f7e34a5b40c..0b231ea8e7a 100644 --- a/tensorflow/compiler/tf2xla/shape_util.h +++ b/tensorflow/compiler/tf2xla/shape_util.h @@ -18,6 +18,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_ #define TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_ +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index 425e769346f..c7341cf8b9e 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -26,7 +26,7 @@ limitations under the License. // Forward-declare, rather than include, to reduce code size for users that // never use this functionality. namespace xla { -class ProgramShape; +class ProgramShapeProto; class HloProfilePrinterData; } @@ -84,7 +84,7 @@ class XlaCompiledCpuFunction { void set_result_names(const char** result_names) { result_names_ = result_names; } - void set_program_shape(const xla::ProgramShape* program_shape) { + void set_program_shape(const xla::ProgramShapeProto* program_shape) { program_shape_ = program_shape; } const xla::HloProfilePrinterData* hlo_profile_printer_data() const { @@ -122,7 +122,7 @@ class XlaCompiledCpuFunction { const char** result_names_ = nullptr; // [Optional] Arg and result shapes. - const xla::ProgramShape* program_shape_ = nullptr; + const xla::ProgramShapeProto* program_shape_ = nullptr; // [Optional] Profile printer data. Null if profiling is disabled. 
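// A rough sketch (not part of the patch): the hunk that follows relaxes
// set_arg_data() to take `const void*`, since XLA-generated code never writes
// through argument buffers. On the caller side a read-only input no longer
// needs a cast; RunOnce and the float argument type are made up for the
// example.
#include <vector>

void RunOnce(XlaCompiledCpuFunction* computation,
             const std::vector<float>& input) {
  computation->set_arg_data(/*index=*/0, input.data());  // const float* is accepted
  computation->Run();
}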
const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; @@ -206,8 +206,14 @@ class XlaCompiledCpuFunction { // // Aliasing of argument and result buffers is not allowed, and results in // undefined behavior. - void set_arg_data(size_t index, void* data) { - buffer_table_[arg_index_table_[index]] = data; + void set_arg_data(size_t index, const void* data) { + // The const_cast is safe because the generated code does not write to arg + // buffers. + // + // buffer_table_ contains pointers to buffers that _will_ be written to by + // generated code so it would be misleading to make buffer_table_ a `const + // void**`. + buffer_table_[arg_index_table_[index]] = const_cast(data); } // ------------------------------ @@ -264,7 +270,7 @@ class XlaCompiledCpuFunction { // Returns the shape of the args and results. May return nullptr if the // program shape isn't available. - const xla::ProgramShape* ProgramShape() const { return program_shape_; } + const xla::ProgramShapeProto* ProgramShape() const { return program_shape_; } bool hlo_profiling_enabled() const { return hlo_profile_printer_data_ != nullptr; @@ -287,11 +293,6 @@ class XlaCompiledCpuFunction { // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]] // for XLA generated code to be able to find it. - // - // For now we need to keep around the args_ array because there is code that - // depends on args() returning a void**. However, in the future we may remove - // args_ in favor of using buffer_table_ as the sole storage for the - // arguments. const int32* const arg_index_table_; // The number of incoming arguments. @@ -310,7 +311,7 @@ class XlaCompiledCpuFunction { // Optional metadata. const char** arg_names_ = nullptr; const char** result_names_ = nullptr; - const xla::ProgramShape* program_shape_ = nullptr; + const xla::ProgramShapeProto* program_shape_ = nullptr; const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; }; diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index a08d030ce71..ee461a3c07d 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -158,7 +158,8 @@ Status BuildComputation( xla::XlaBuilder* builder, xla::XlaComputation* computation, int* num_computation_outputs, int* num_nonconst_outputs, std::vector* outputs, - std::vector* resource_updates) { + std::vector* resource_updates, + xla::Shape* output_shape) { // Attach a common operator name as metadata. This has no semantic effect ā€” it // merely makes the HLO graph more readable when visualized via TensorBoard, // since TensorBoard forms groups out of operators with similar names. @@ -176,6 +177,10 @@ Status BuildComputation( std::vector elems; elems.reserve(retvals.size()); + + // Keeps track of which retvals have layout to update. The first element is + // the output index, second element is the new layout. + std::vector> retval_to_update_layout; for (int i = 0; i < retvals.size(); ++i) { XlaCompiler::OutputDescription& output = (*outputs)[i]; const XlaExpression& retval = retvals[i]; @@ -202,10 +207,12 @@ Status BuildComputation( TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn( output.shape, output.type)); value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions())); + retval_to_update_layout.emplace_back(elems.size(), shape.layout()); } else if (it != retval_cores.end()) { // Apply the sharding to the output, if there is a core assignment. 
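// A rough sketch (not part of the patch): BuildComputation now records, for
// each reshaped return value, the layout chosen by shape_representation_fn
// and patches it into the computation's result shape (next hunk), instead of
// resetting the output to a default major-to-minor layout afterwards. The
// patching step in isolation, with ApplyRetvalLayouts as a made-up name:
#include <utility>
#include <vector>

void ApplyRetvalLayouts(
    const std::vector<std::pair<int64, xla::Layout>>& retval_to_update_layout,
    bool output_is_tuple, xla::Shape* output_shape) {
  for (const auto& update : retval_to_update_layout) {
    xla::Shape* sub_shape =
        output_is_tuple
            ? xla::ShapeUtil::GetMutableSubshape(output_shape, {update.first})
            : output_shape;  // a single untupled result is patched directly
    *sub_shape->mutable_layout() = update.second;
  }
}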
value = identity_op(value); } + elems.push_back(value); break; } @@ -297,6 +304,21 @@ Status BuildComputation( return computation_status.status(); } *computation = computation_status.ConsumeValueOrDie(); + + TF_ASSIGN_OR_RETURN(const auto& program_shape, + computation->GetProgramShape()); + *output_shape = program_shape.result(); + // Update the output layout to the layout of retval. + for (auto& update : retval_to_update_layout) { + if (!always_return_tuple && elems.size() == 1) { + *output_shape->mutable_layout() = update.second; + continue; + } + + xla::Shape* output_sub_shape = + xla::ShapeUtil::GetMutableSubshape(output_shape, {update.first}); + *output_sub_shape->mutable_layout() = update.second; + } return Status::OK(); } @@ -304,10 +326,10 @@ Status BuildComputation( bool XlaCompiler::Argument::operator==( const XlaCompiler::Argument& other) const { - if (std::tie(kind, resource_kind, type, name, initialized, tensor_array_size, + if (std::tie(kind, resource_kind, type, name, initialized, max_array_size, tensor_array_gradients) != std::tie(other.kind, other.resource_kind, other.type, other.name, - other.initialized, other.tensor_array_size, + other.initialized, other.max_array_size, other.tensor_array_gradients)) { return false; } @@ -337,8 +359,8 @@ string XlaCompiler::Argument::HumanString() const { string output = absl::StrCat("kind=resource", common, " resource_kind=", XlaResource::KindToString(resource_kind), " initialized=", initialized); - if (tensor_array_size >= 0) { - absl::StrAppend(&output, " tensor_array_size=", tensor_array_size); + if (max_array_size >= 0) { + absl::StrAppend(&output, " max_array_size=", max_array_size); } if (!tensor_array_gradients.empty()) { absl::StrAppend(&output, " tensor_array_gradients=", @@ -358,7 +380,7 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options) initialization_status_(Status::OK()), next_step_id_(1), device_(new XlaCompilationDevice(SessionOptions(), options_.device_type)), - device_mgr_({device_}) { + device_mgr_(absl::WrapUnique(device_)) { CHECK(!options_.device_type.type_string().empty()); if (options_.populate_resource_manager) { initialization_status_ = @@ -545,12 +567,12 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, return Status::OK(); } case XlaResource::kTensorArray: { - if (arg.tensor_array_size < 0) { + if (arg.max_array_size < 0) { return errors::InvalidArgument( - "Negative tensor_array_size in XLAShapeForArgument"); + "Negative max_array_size in XLAShapeForArgument"); } TensorShape shape; - shape.AddDim(arg.tensor_array_size); + shape.AddDim(arg.max_array_size); shape.AppendShape(arg.shape); TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, xla_shape)); @@ -562,12 +584,12 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, return Status::OK(); } case XlaResource::kStack: { - if (arg.tensor_array_size < 0) { + if (arg.max_array_size < 0) { return errors::InvalidArgument( - "Negative tensor_array_size in XLAShapeForArgument"); + "Negative max_array_size in XLAShapeForArgument"); } TensorShape shape; - shape.AddDim(arg.tensor_array_size); + shape.AddDim(arg.max_array_size); shape.AppendShape(arg.shape); xla::Shape buffer_shape; TF_RETURN_IF_ERROR( @@ -613,21 +635,23 @@ Status XlaCompiler::BuildArguments( const XlaCompiler::Argument& arg = args[i]; XlaExpression& arg_expression = (*arg_expressions)[i]; switch (arg.kind) { - case XlaCompiler::Argument::kResource: + case XlaCompiler::Argument::kResource: { TF_RET_CHECK(arg.resource_kind != 
XlaResource::kInvalid); // TODO(phawkins): this code assumes that resource arguments do not // alias. - XlaResource* resource; - TF_RETURN_IF_ERROR(context->CreateResource( - arg.resource_kind, i, arg.name, arg.type, arg.shape, xla::XlaOp(), - /*tensor_array_size=*/arg.tensor_array_size, - /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource)); + XlaResource* resource = + context->AddResource(absl::make_unique( + arg.resource_kind, i, arg.name, arg.type, arg.shape, + xla::XlaOp(), + /*max_array_size=*/arg.max_array_size, + /*tensor_array_gradients=*/arg.tensor_array_gradients, + /*tensor_array_multiple_writes_aggregate=*/true)); arg_expression = XlaExpression::Resource(resource); if (arg.initialized) { input_mapping->push_back(i); } - break; + } case XlaCompiler::Argument::kParameter: case XlaCompiler::Argument::kToken: { input_mapping->push_back(i); @@ -901,9 +925,7 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, options_.device_type, name)); xla::XlaBuilder builder(name); - XlaContext* context = - new XlaContext(this, &builder, options_.allow_cpu_custom_calls, - &options_.shape_representation_fn); + XlaContext* context = new XlaContext(this, &builder); core::ScopedUnref context_unref(context); std::vector real_args(args.begin(), args.end()); @@ -988,23 +1010,12 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, options.return_updated_values_for_all_resources, options.always_return_tuple, &builder, result->computation.get(), &num_computation_outputs, &num_nonconst_outputs, &result->outputs, - &result->resource_updates)); + &result->resource_updates, &result->xla_output_shape)); VLOG(2) << "Outputs: total: " << context->retvals().size() << " nonconstant: " << num_nonconst_outputs; - - // Compute the XLA output shape, if there is a computation with non-constant - // outputs. - TF_ASSIGN_OR_RETURN(std::unique_ptr computation_shape, - client()->GetComputationShape(*result->computation)); - - result->xla_output_shape.Swap(computation_shape->mutable_result()); VLOG(2) << "XLA output shape: " - << xla::ShapeUtil::HumanString(result->xla_output_shape); - - // Tensorflow expects a major-to-minor order of results. - xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape); - + << xla::ShapeUtil::HumanStringWithLayout(result->xla_output_shape); return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 63426124686..0d801b73a8c 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -150,7 +150,7 @@ class XlaCompiler { // For a TensorArray or Stack resource, what is the array's declared size? // (Used for lazy initialization.) - int64 tensor_array_size = -1; + int64 max_array_size = -1; // TensorArray resource parameters are passed as (array, gradient array 0, // ..., gradient array k), where the gradient arrays are in the same order diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index aaee208f634..fe2a5f5b0c9 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
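// A rough sketch (not part of the patch): the max_array_size field renamed
// just above sizes the leading dimension of a TensorArray or Stack argument's
// XLA shape, as XLAShapeForArgument does earlier in this patch. Reduced to
// its core (TensorArrayXlaShape is a made-up name; TensorShapeToXLAShape is
// the existing tf2xla/shape_util.h helper):
Status TensorArrayXlaShape(const XlaCompiler::Argument& arg,
                           xla::Shape* xla_shape) {
  if (arg.max_array_size < 0) {
    return errors::InvalidArgument("Negative max_array_size in argument");
  }
  TensorShape shape;
  shape.AddDim(arg.max_array_size);
  shape.AppendShape(arg.shape);
  return TensorShapeToXLAShape(arg.type, shape, xla_shape);
}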
#include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/side_effect_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" @@ -649,7 +650,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) { args[0].initialized = true; args[0].type = DT_INT32; args[0].shape = TensorShape({}); - args[0].tensor_array_size = 2; + args[0].max_array_size = 2; args[0].tensor_array_gradients = {"grad2"}; // Compiles the graph. @@ -708,7 +709,7 @@ TEST_F(XlaCompilerTest, UnwrittenTensorArrayGradientsAreNotComputationOutputs) { args[0].initialized = true; args[0].type = DT_INT32; args[0].shape = TensorShape({}); - args[0].tensor_array_size = 2; + args[0].max_array_size = 2; args[0].tensor_array_gradients = {"grad1"}; // Compiles the graph. @@ -740,7 +741,7 @@ TEST_F(XlaCompilerTest, NewTensorArrayGradientsAreComputationOutputs) { args[0].initialized = true; args[0].type = DT_INT32; args[0].shape = TensorShape({}); - args[0].tensor_array_size = 2; + args[0].max_array_size = 2; args[0].tensor_array_gradients = {"grad1"}; // Compiles the graph. @@ -910,6 +911,82 @@ TEST_F(XlaCompilerTest, Variables) { RunAndCheckVariablesComputation(client_, result); } +TEST_F(XlaCompilerTest, ResultLayoutSingle) { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); + auto b = ops::_Retval(scope.WithOpName("RET"), a, 0); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. + std::vector args(1); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2, 3}); + + auto options = DefaultOptions(); + // Sets the representation function to return a non-default layout. + options.shape_representation_fn = + [](const TensorShape& shape, DataType type) -> xla::StatusOr { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); + *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1}); + return xla_shape; + }; + + // Compiles the graph. + XlaCompiler compiler(options); + + XlaCompiler::CompilationResult result; + auto compile_options = XlaCompiler::CompileOptions(); + compile_options.always_return_tuple = false; + TF_ASSERT_OK(compiler.CompileGraph(compile_options, "id", std::move(graph), + args, &result)); + EXPECT_TRUE(xla::ShapeUtil::Equal( + result.xla_output_shape, + xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1}))); +} + +TEST_F(XlaCompilerTest, ResultLayoutMultiple) { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); + auto b = ops::_Retval(scope.WithOpName("RET1"), a, 0); + auto c = ops::_Retval(scope.WithOpName("RET2"), a, 1); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. + std::vector args(1); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2, 3}); + + auto options = DefaultOptions(); + // Sets the representation function to return a non-default layout. 
+ options.shape_representation_fn = + [](const TensorShape& shape, DataType type) -> xla::StatusOr { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); + *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1}); + return xla_shape; + }; + + // Compiles the graph. + XlaCompiler compiler(options); + + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "id", + std::move(graph), args, &result)); + xla::Shape result_shape = + xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1}); + + EXPECT_TRUE(xla::ShapeUtil::Equal( + result.xla_output_shape, + xla::ShapeUtil::MakeTupleShape({result_shape, result_shape}))); +} + // Tests a simple graph that reads and writes a variable. TEST_F(XlaCompilerTest, ReturnResourceHandleOnly) { Scope scope = Scope::NewRootScope().ExitOnError(); diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index 43095fbb473..a69af705033 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -54,25 +54,14 @@ const char XlaContext::kXlaContextResourceName[] = "_xla_context"; return *context; } -/* static */ XlaContext& XlaContext::Get(const XlaOpKernelContext* ctx) { - return Get(ctx->op_kernel_context()); -} - void XlaContext::set_args(std::vector args) { args_ = std::move(args); } -XlaContext::XlaContext( - XlaCompiler* compiler, xla::XlaBuilder* builder, - bool allow_cpu_custom_calls, - const std::function( - const TensorShape&, DataType)>* shape_representation_fn) - : compiler_(compiler), - builder_(builder), - allow_cpu_custom_calls_(allow_cpu_custom_calls), - shape_representation_fn_(shape_representation_fn) {} +XlaContext::XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder) + : compiler_(compiler), builder_(builder) {} -string XlaContext::DebugString() { return "TLA JIT context"; } +string XlaContext::DebugString() { return "XLA JIT context"; } void XlaContext::SetRetval(int index, const XlaExpression& expression) { if (retvals_.size() <= index) { @@ -81,21 +70,9 @@ void XlaContext::SetRetval(int index, const XlaExpression& expression) { retvals_[index] = expression; } -Status XlaContext::CreateResource( - XlaResource::Kind kind, int arg_num, string name, DataType type, - TensorShape shape, const xla::XlaOp& handle, int64 tensor_array_size, - const std::set& tensor_array_gradients, XlaResource** resource) { - resources_.emplace_back( - new XlaResource(kind, arg_num, std::move(name), type, std::move(shape), - handle, tensor_array_size, tensor_array_gradients, - /*tensor_array_multiple_writes_aggregate=*/false)); - *resource = resources_.back().get(); - return Status::OK(); -} - -xla::StatusOr XlaContext::RepresentationShape( - const TensorShape& shape, DataType type) const { - return (*shape_representation_fn_)(shape, type); +XlaResource* XlaContext::AddResource(std::unique_ptr resource) { + resources_.push_back(std::move(resource)); + return resources_.back().get(); } const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) { diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h index dbfd344c9ba..0767d1faac1 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -41,14 +41,10 @@ class XlaContext : public ResourceBase { public: // Retrieves the XlaContext of the current compilation. 
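// A rough sketch (not part of the patch): resource creation moves from the
// removed XlaContext::CreateResource to the ownership-transferring
// AddResource, typically fed by the XlaResource factory functions used by the
// stack and tensor-array kernels in this patch. A stack-flavoured kernel body
// then reduces to roughly this (dtype and max_size stand in for kernel state):
void CompileStackLikeOp(XlaOpKernelContext* ctx, DataType dtype,
                        int64 max_size) {
  XlaResource* resource = ctx->xla_context()->AddResource(
      XlaResource::CreateStack(/*name=*/"Stack: example", dtype, max_size));
  ctx->SetResourceOutput(0, resource);
}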
static XlaContext& Get(const OpKernelContext* ctx); - static XlaContext& Get(const XlaOpKernelContext* ctx); // Creates a new XlaContext. See the documentation on the class data fields // for descriptions of the arguments. - XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder, - bool allow_cpu_custom_calls, - const std::function( - const TensorShape&, DataType)>* shape_representation_fn); + XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder); // Virtual method defined by ResourceBase. string DebugString() override; @@ -58,8 +54,6 @@ class XlaContext : public ResourceBase { // Returns the XlaBuilder that Ops use for compiling new expressions. xla::XlaBuilder* builder() { return builder_; } - bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; } - const std::vector& args() const { return args_; } void set_args(std::vector args); @@ -70,25 +64,13 @@ class XlaContext : public ResourceBase { // grows the return values vector to size index+1 if it is smaller. void SetRetval(int index, const XlaExpression& expression); - // Creates a resource with resource `kind` and initial value `handle`. `name` - // is a descriptive name for use in error messages. See the `XlaResource` - // constructor for a description of the remaining arguments. - // Fails if the resource already exists. - Status CreateResource(XlaResource::Kind kind, int arg_num, string name, - DataType type, TensorShape shape, - const xla::XlaOp& handle, int64 tensor_array_size, - const std::set& tensor_array_gradients, - XlaResource** resource); + // Adds 'resource' to the set of resources owned by the context. + XlaResource* AddResource(std::unique_ptr resource); const std::vector>& resources() { return resources_; } - // Returns the XLA shape to be used to represent a variable of TF `shape` - // and `type`, or of an argument or return value of a top-level computation. - xla::StatusOr RepresentationShape(const TensorShape& shape, - DataType type) const; - // Get an XLA lambda to compute Max. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. @@ -118,9 +100,6 @@ class XlaContext : public ResourceBase { // The XlaBuilder used to construct the subgraph's compiled representation. xla::XlaBuilder* builder_; - // Allow ops to emit CustomCall operations for CPU. - const bool allow_cpu_custom_calls_; - // Arguments to the Tensorflow graph, indexed by _Arg index. // Includes both compile-time constant arguments and runtime parameters. std::vector args_; @@ -131,11 +110,6 @@ class XlaContext : public ResourceBase { // Holds ownership of resources. The resources are not ordered. std::vector> resources_; - // Describes the on-host shapes of parameters and return values. Also see: - // XlaDevice::Options::shape_representation_fn. - const std::function(const TensorShape&, DataType)>* - shape_representation_fn_; - // Cache of prebuilt computations indexed by their type. using ComputationMap = std::map; diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc index 9a34cd8c6ae..c2c07512111 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.cc +++ b/tensorflow/compiler/tf2xla/xla_helpers.cc @@ -26,7 +26,6 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/types.h" @@ -216,8 +215,7 @@ DataType XlaHelpers::SumAccumulationType(const DataType& dtype) { return dtype; } -xla::XlaOp XlaHelpers::ConvertElementType(xla::XlaBuilder* const builder, - const xla::XlaOp& operand, +xla::XlaOp XlaHelpers::ConvertElementType(const xla::XlaOp& operand, const DataType new_element_type) { xla::PrimitiveType convert_to; TF_CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to)); diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h index 39578144caa..4858dfee55a 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.h +++ b/tensorflow/compiler/tf2xla/xla_helpers.h @@ -80,8 +80,7 @@ class XlaHelpers { // A helper for creating a ConvertElementType xla op given a DataType rather // than the xla::PrimitiveType. - static xla::XlaOp ConvertElementType(xla::XlaBuilder* const builder, - const xla::XlaOp& operand, + static xla::XlaOp ConvertElementType(const xla::XlaOp& operand, const DataType new_element_type); }; diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc index 86a78ee429e..fabbcd04fed 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc @@ -133,7 +133,8 @@ XlaJitCompiledCpuFunction::Compile( jit->executable_ = std::move(executable); jit->buffer_infos_ = std::move(buffer_infos); jit->arg_index_table_ = std::move(arg_index_table); - jit->program_shape_ = std::move(program_shape); + jit->program_shape_ = + absl::make_unique(program_shape->ToProto()); jit->static_data_.set_raw_function(raw_function); jit->static_data_.set_buffer_infos(jit->buffer_infos_.data()); jit->static_data_.set_num_buffers(jit->buffer_infos_.size()); diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h index d3c8f22a807..a5392057177 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h @@ -80,8 +80,10 @@ class XlaJitCompiledCpuFunction { std::vector arg_names_; std::vector result_names_; - // The backing data for the program shape. - std::unique_ptr program_shape_; + // The backing data for the program shape. The proto form of program shape is + // used because the program shape is serialized and embedded in the object + // file. + std::unique_ptr program_shape_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc index 6d49298a6f3..8846088678b 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc @@ -116,13 +116,13 @@ TEST(XlaJitCompiledCpuFunction, Sum) { // Check program shape. 
using xla::ShapeUtil; const xla::Shape s32 = ShapeUtil::MakeShape(xla::S32, {}); - const xla::ProgramShape* program_shape = function.ProgramShape(); - ASSERT_TRUE(program_shape != nullptr); - ASSERT_EQ(program_shape->parameters_size(), 2); - EXPECT_TRUE(ShapeUtil::Compatible(program_shape->parameters(0), s32)); - EXPECT_TRUE(ShapeUtil::Compatible(program_shape->parameters(1), s32)); + ASSERT_TRUE(function.ProgramShape() != nullptr); + const xla::ProgramShape program_shape(*function.ProgramShape()); + ASSERT_EQ(program_shape.parameters_size(), 2); + EXPECT_TRUE(ShapeUtil::Compatible(program_shape.parameters(0), s32)); + EXPECT_TRUE(ShapeUtil::Compatible(program_shape.parameters(1), s32)); - const xla::Shape& result = program_shape->result(); + const xla::Shape& result = program_shape.result(); ASSERT_EQ(result.element_type(), xla::TUPLE); ASSERT_EQ(ShapeUtil::TupleElementCount(result), 1); const xla::Shape& result0 = ShapeUtil::GetTupleElementShape(result, 0); diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index 8dd8def0549..58808c76de6 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -36,8 +36,16 @@ bool XlaOpKernelContext::ValidateInputsAreSameShape(OpKernel* op) { return context_->ValidateInputsAreSameShape(op); } +XlaContext* XlaOpKernelContext::xla_context() const { + return &XlaContext::Get(context_); +} + xla::XlaBuilder* XlaOpKernelContext::builder() const { - return XlaContext::Get(this).builder(); + return xla_context()->builder(); +} + +XlaCompiler* XlaOpKernelContext::compiler() const { + return xla_context()->compiler(); } // Retrieves an XlaExpression that was allocated by a previous Op. @@ -338,8 +346,8 @@ Status XlaOpKernelContext::ConstantInputList( namespace { Status ReadVariableInputTensor(const Tensor& tensor, DataType type, - const OpKernelContext* ctx, TensorShape* shape, - xla::XlaOp* value) { + const XlaOpKernelContext* ctx, + TensorShape* shape, xla::XlaOp* value) { const XlaExpression* expression = CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); TF_RET_CHECK(variable != nullptr); @@ -357,10 +365,9 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type, *shape = variable->shape(); } - XlaContext& xla_context = XlaContext::Get(ctx); - TF_ASSIGN_OR_RETURN( - xla::Shape representation_shape, - xla_context.RepresentationShape(variable->shape(), variable->type())); + TF_ASSIGN_OR_RETURN(xla::Shape representation_shape, + ctx->compiler()->options().shape_representation_fn( + variable->shape(), variable->type())); xla::Shape xla_shape; TF_RETURN_IF_ERROR( TensorShapeToXLAShape(variable->type(), variable->shape(), &xla_shape)); @@ -377,15 +384,15 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type, Status XlaOpKernelContext::ReadVariableInput(int index, DataType type, TensorShape* shape, xla::XlaOp* value) { - return ReadVariableInputTensor(context_->input(index), type, context_, shape, + return ReadVariableInputTensor(context_->input(index), type, this, shape, value); } Status XlaOpKernelContext::ReadVariableInput(absl::string_view name, DataType type, TensorShape* shape, xla::XlaOp* value) { - return ReadVariableInputTensor(GetInputTensorByName(name), type, context_, - shape, value); + return ReadVariableInputTensor(GetInputTensorByName(name), type, this, shape, + value); } Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type, @@ -464,7 +471,7 @@ Status 
XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) { namespace { Status AssignVariableTensor(const Tensor& tensor, DataType type, - const OpKernelContext* ctx, xla::XlaOp handle, + const XlaOpKernelContext* ctx, xla::XlaOp handle, xla::XlaBuilder* builder) { const XlaExpression* expression = CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); @@ -481,9 +488,9 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type, TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape)); - XlaContext& xla_context = XlaContext::Get(ctx); - TF_ASSIGN_OR_RETURN(xla::Shape representation_shape, - xla_context.RepresentationShape(shape, type)); + TF_ASSIGN_OR_RETURN( + xla::Shape representation_shape, + ctx->compiler()->options().shape_representation_fn(shape, type)); xla::Shape xla_shape; TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); if (!xla::ShapeUtil::Compatible(xla_shape, representation_shape)) { @@ -498,19 +505,15 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type, Status XlaOpKernelContext::AssignVariable(int input_index, DataType type, xla::XlaOp handle) { TF_RET_CHECK(handle.valid()); - return AssignVariableTensor(context_->input(input_index), type, context_, - handle, builder()); + return AssignVariableTensor(context_->input(input_index), type, this, handle, + builder()); } Status XlaOpKernelContext::AssignVariable(absl::string_view name, DataType type, xla::XlaOp handle) { TF_RET_CHECK(handle.valid()); - return AssignVariableTensor(GetInputTensorByName(name), type, context_, - handle, builder()); -} - -XlaCompiler* XlaOpKernelContext::compiler() const { - return XlaContext::Get(context_).compiler(); + return AssignVariableTensor(GetInputTensorByName(name), type, this, handle, + builder()); } void XlaOpKernelContext::CtxFailure(const Status& s) { @@ -530,22 +533,22 @@ void XlaOpKernelContext::CtxFailureWithWarning(const char* file, int line, const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMax( const DataType type) { - return XlaContext::Get(context_).GetOrCreateMax(type); + return xla_context()->GetOrCreateMax(type); } const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMin( const DataType type) { - return XlaContext::Get(context_).GetOrCreateMin(type); + return xla_context()->GetOrCreateMin(type); } const xla::XlaComputation* XlaOpKernelContext::GetOrCreateAdd( const DataType type) { - return XlaContext::Get(context_).GetOrCreateAdd(type); + return xla_context()->GetOrCreateAdd(type); } const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMul( const DataType type) { - return XlaContext::Get(context_).GetOrCreateMul(type); + return xla_context()->GetOrCreateMul(type); } const Tensor& XlaOpKernelContext::GetInputTensorByName(absl::string_view name) { diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index c06efa2c474..1858844bc05 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -60,6 +60,8 @@ class XlaOpKernelContext { public: explicit XlaOpKernelContext(OpKernelContext* context); + XlaContext* xla_context() const; + // Returns the XLA XlaBuilder containing the output of compilation. 
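With `xla_context()` and `compiler()` exposed on `XlaOpKernelContext`, the variable read/write paths above fetch shared state through the kernel context instead of `XlaContext::Get` and `XlaContext::RepresentationShape`. A hedged sketch of the pattern, as it might appear in a kernel's `Compile` body (the helper name and the `{2, 3}` float shape are placeholders):

```
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/core/platform/logging.h"

// Hypothetical helper called from an op kernel's Compile() method.
void IllustrateContextAccessors(XlaOpKernelContext* ctx) {
  // Builder and compiler both hang off the per-compilation XlaContext.
  xla::XlaBuilder* b = ctx->xla_context()->builder();
  CHECK_EQ(b, ctx->builder());

  // Representation shapes are now looked up through the compiler options
  // rather than through XlaContext::RepresentationShape().
  auto shape_or = ctx->compiler()->options().shape_representation_fn(
      TensorShape({2, 3}), DT_FLOAT);
  OP_REQUIRES_OK(ctx, shape_or.status());
  const xla::Shape& representation = shape_or.ValueOrDie();
  VLOG(1) << "representation shape: "
          << xla::ShapeUtil::HumanStringWithLayout(representation);
}
```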
xla::XlaBuilder* builder() const; diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index dcd0e9c5c1f..14237df6908 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" @@ -130,8 +130,7 @@ XlaOpRegistry::~XlaOpRegistry() = default; // Lazily register the CPU and GPU JIT devices the first time // GetCompilationDevice is called. static void* registration_init = [®istry]() { - legacy_flags::MarkForCompilationPassFlags* flags = - legacy_flags::GetMarkForCompilationPassFlags(); + MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); bool cpu_global_jit = flags->tf_xla_cpu_global_jit; mutex_lock lock(registry.mutex_); diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc index a322eb9015e..48a3c012727 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.cc +++ b/tensorflow/compiler/tf2xla/xla_resource.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/sharding_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" @@ -39,9 +40,29 @@ namespace tensorflow { } } +/*static*/ std::unique_ptr XlaResource::CreateStack( + string name, DataType type, int64 max_size) { + return absl::make_unique( + XlaResource::kStack, /*arg_num=*/-1, std::move(name), type, TensorShape(), + /*initial_value=*/xla::XlaOp(), + /*max_array_size=*/max_size, + /*tensor_array_gradients=*/std::set{}, + /*tensor_array_multiple_writes_aggregate=*/false); +} + +/*static*/ std::unique_ptr XlaResource::CreateTensorArray( + string name, DataType type, TensorShape shape, xla::XlaOp initial_value, + int64 max_array_size) { + return absl::make_unique( + XlaResource::kTensorArray, /*arg_num=*/-1, std::move(name), type, shape, + initial_value, max_array_size, + /*tensor_array_gradients=*/std::set{}, + /*tensor_array_multiple_writes_aggregate=*/false); +} + XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type, TensorShape shape, const xla::XlaOp& initial_value, - int64 tensor_array_size, + int64 max_array_size, const std::set& tensor_array_gradients, bool tensor_array_multiple_writes_aggregate) : kind_(kind), @@ -51,7 +72,7 @@ XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type, shape_(std::move(shape)), value_(initial_value), initial_value_(initial_value), - tensor_array_size_(tensor_array_size), + max_array_size_(max_array_size), tensor_array_multiple_writes_aggregate_( tensor_array_multiple_writes_aggregate) { CHECK(kind_ != kInvalid); @@ -60,7 +81,7 @@ XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type, tensor_array_gradients_[gradient].reset(new XlaResource( /*kind=*/kTensorArray, /*arg_num=*/-1, /*name=*/absl::StrCat("TensorArrayGrad: ", name_), type_, shape_, - xla::XlaOp(), tensor_array_size_, /*tensor_array_gradients=*/{}, + xla::XlaOp(), max_array_size_, /*tensor_array_gradients=*/{}, /*tensor_array_multiple_writes_aggregate=*/true)); } } @@ -113,7 +134,7 @@ Status 
XlaResource::SetZeroValue(xla::XlaBuilder* builder) { } case kTensorArray: { TensorShape ta_shape; - ta_shape.AddDim(tensor_array_size_); + ta_shape.AddDim(max_array_size_); ta_shape.AppendShape(shape_); value_ = xla::Broadcast(XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes()); @@ -121,7 +142,7 @@ Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) { } case kStack: { TensorShape ta_shape; - ta_shape.AddDim(tensor_array_size_); + ta_shape.AddDim(max_array_size_); ta_shape.AppendShape(shape_); value_ = xla::Tuple(builder, {xla::Broadcast(XlaHelpers::Zero(builder, type_), @@ -146,14 +167,14 @@ Status XlaResource::GetOrCreateTensorArrayGradient(const string& source, std::unique_ptr& gradient = tensor_array_gradients_[source]; if (!gradient) { TensorShape ta_shape; - ta_shape.AddDim(tensor_array_size_); + ta_shape.AddDim(max_array_size_); ta_shape.AppendShape(shape_); xla::XlaOp gradient_value = xla::Broadcast(XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes()); gradient.reset( new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1, /*name=*/absl::StrCat("TensorArrayGrad: ", name_), - type_, shape_, gradient_value, tensor_array_size_, + type_, shape_, gradient_value, max_array_size_, /*tensor_array_gradients=*/{}, /*tensor_array_multiple_writes_aggregate=*/true)); } diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h index 857b9a928bb..736588bb8b8 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.h +++ b/tensorflow/compiler/tf2xla/xla_resource.h @@ -38,9 +38,18 @@ class XlaResource { }; static absl::string_view KindToString(Kind kind); + // Creates a new Stack resource. + static std::unique_ptr CreateStack(string name, DataType type, + int64 max_size); + + // Creates a new TensorArray resource. + static std::unique_ptr CreateTensorArray( + string name, DataType type, TensorShape shape, xla::XlaOp initial_value, + int64 max_array_size); + XlaResource(Kind kind, int arg_num, string name, DataType type, TensorShape shape, const xla::XlaOp& initial_value, - int64 tensor_array_size, + int64 max_array_size, const std::set& tensor_array_gradients, bool tensor_array_multiple_writes_aggregate); @@ -119,12 +128,12 @@ class XlaResource { // TODO(phawkins): refactor this code to use subclasses, rather than putting // kind-specific fields in XlaResource. - // 'tensor_array_size' stores the expected size of the TensorArray or Stack. + // 'max_array_size' stores the expected size of the TensorArray or Stack. // We need to store this since sometimes TensorArrays must be initialized // lazily since we do not know the element shape at construction time. // Used by both TensorArrays and Stacks. 
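The new `CreateStack`/`CreateTensorArray` factories together with `XlaContext::AddResource` replace the old `XlaContext::CreateResource` entry point, and `max_array_size` replaces `tensor_array_size`. A sketch of how a kernel might now allocate and register a TensorArray resource; the element shape `{128}`, the size 16, and the zero-filled initial value are placeholders:

```
#include "absl/memory/memory.h"
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_resource.h"

XlaResource* MakeExampleTensorArray(XlaOpKernelContext* ctx) {
  xla::XlaBuilder* b = ctx->builder();
  // Zero-filled [max_array_size, element_shape] buffer as the initial value.
  xla::XlaOp init = xla::Broadcast(XlaHelpers::Zero(b, DT_FLOAT), {16, 128});
  std::unique_ptr<XlaResource> ta = XlaResource::CreateTensorArray(
      /*name=*/"TensorArray: example", /*type=*/DT_FLOAT,
      /*shape=*/TensorShape({128}), /*initial_value=*/init,
      /*max_array_size=*/16);
  // Ownership moves to the per-compilation context; the returned raw
  // pointer stays valid for the rest of the compilation.
  XlaResource* resource = ctx->xla_context()->AddResource(std::move(ta));
  CHECK_EQ(resource->max_array_size(), 16);  // renamed from tensor_array_size()
  return resource;
}
```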
- int64 tensor_array_size() const { return tensor_array_size_; } - void set_tensor_array_size(int64 size) { tensor_array_size_ = size; } + int64 max_array_size() const { return max_array_size_; } + void set_max_array_size(int64 size) { max_array_size_ = size; } bool tensor_array_multiple_writes_aggregate() const { return tensor_array_multiple_writes_aggregate_; @@ -151,7 +160,7 @@ class XlaResource { xla::XlaOp value_; xla::XlaOp initial_value_; - int64 tensor_array_size_ = -1; + int64 max_array_size_ = -1; bool tensor_array_multiple_writes_aggregate_ = false; std::map> tensor_array_gradients_; diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 91096cf1d04..4360e085796 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -226,12 +226,14 @@ cc_library( "index_util.cc", "layout_util.cc", "primitive_util.cc", + "shape.cc", "shape_util.cc", ], hdrs = [ "index_util.h", "layout_util.h", "primitive_util.h", + "shape.h", "shape_util.h", ], visibility = ["//visibility:public"], @@ -254,6 +256,23 @@ cc_library( ], ) +tf_cc_test( + name = "shape_test", + srcs = ["shape_test.cc"], + deps = [ + ":shape_util", + ":status_macros", + ":test", + ":test_helpers", + ":types", + ":util", + ":xla_data_proto", + "//tensorflow/core:lib", + "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", + ], +) + tf_cc_test( name = "shape_util_test", srcs = ["shape_util_test.cc"], @@ -745,6 +764,8 @@ cc_library( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h index 782c966b4c5..e4aca98f67d 100644 --- a/tensorflow/compiler/xla/array2d.h +++ b/tensorflow/compiler/xla/array2d.h @@ -104,7 +104,7 @@ std::unique_ptr> MakeLinspaceArray2D(double from, double to, int64 count = n1 * n2; NativeT step = static_cast((count > 1) ? 
(to - from) / (count - 1) : 0); - auto set = [&array, n1, n2](int64 index, NativeT value) { + auto set = [&array, n2](int64 index, NativeT value) { (*array)(index / n2, index % n2) = value; }; for (int64 i = 0; i < count - 1; ++i) { diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index 42da0ebf499..fe99564d3c6 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -81,6 +81,7 @@ cc_library( "//tensorflow/core:lib", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], ) @@ -90,11 +91,12 @@ cc_library( srcs = ["executable_build_options.cc"], hdrs = ["executable_build_options.h"], deps = [ + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/service:device_memory_allocator", - "//tensorflow/core:lib", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", @@ -191,6 +193,7 @@ cc_library( hdrs = ["xla_computation.h"], visibility = ["//visibility:public"], deps = [ + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc index eef2844e0df..74b76f92994 100644 --- a/tensorflow/compiler/xla/client/client.cc +++ b/tensorflow/compiler/xla/client/client.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/execution_options_util.h" @@ -42,7 +43,7 @@ StatusOr Client::Transfer(const GlobalData& data, TransferToClientRequest request; *request.mutable_data() = data.handle(); if (shape_with_layout != nullptr) { - *request.mutable_shape_with_layout() = *shape_with_layout; + *request.mutable_shape_with_layout() = shape_with_layout->ToProto(); } TransferToClientResponse response; @@ -123,7 +124,7 @@ StatusOr Client::TransferFromOutfeed( } request.set_replica_id(replica_id); if (shape_with_layout != nullptr) { - *request.mutable_shape_with_layout() = *shape_with_layout; + *request.mutable_shape_with_layout() = shape_with_layout->ToProto(); } TransferFromOutfeedResponse response; @@ -170,11 +171,14 @@ StatusOr Client::ExecuteAndTransfer( std::unique_ptr data, Execute(computation, arguments, execution_options, execution_profile)); - const Shape* shape_with_output_layout = nullptr; + absl::optional shape_with_output_layout; if (execution_options && execution_options->has_shape_with_output_layout()) { - shape_with_output_layout = &execution_options->shape_with_output_layout(); + shape_with_output_layout = + Shape(execution_options->shape_with_output_layout()); } - return Transfer(*data, shape_with_output_layout); + return Transfer(*data, shape_with_output_layout.has_value() + ? &(*shape_with_output_layout) + : nullptr); } StatusOr Client::ComputeConstant(const XlaComputation& computation, @@ -229,7 +233,7 @@ StatusOr Client::Compile( // The argument shapes affect how the computation is compiled. 
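The client changes above all follow one pattern: `xla::Shape` is now a C++ class, so values crossing the RPC boundary are converted explicitly with `Shape::ToProto()` and rebuilt with the `Shape(const ShapeProto&)` constructor. A minimal round-trip sketch, assuming the `ShapeProto` message that accompanies the new `shape.h`/`shape.cc` target; the concrete `F32[2,3]` shape is arbitrary:

```
#include "tensorflow/compiler/xla/shape.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/logging.h"

void ShapeProtoRoundTrip() {
  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {2, 3});
  // The proto form is what gets written into request fields such as
  // *request.mutable_shape_with_layout() above.
  xla::ShapeProto proto = shape.ToProto();
  // The Shape(ShapeProto) constructor rebuilds the class form, as in
  // `return Shape(response.shape());` in Client::GetShape.
  xla::Shape round_tripped(proto);
  CHECK(xla::ShapeUtil::Equal(shape, round_tripped));
}
```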
for (const auto& arg_shape : argument_shapes) { - *request.add_input_shape_with_layout() = arg_shape; + *request.add_input_shape_with_layout() = arg_shape.ToProto(); } CompileResponse response; @@ -458,7 +462,7 @@ StatusOr Client::GetShape(const GlobalData& data) { return s; } - return response.shape(); + return Shape(response.shape()); } StatusOr Client::ExecutionStatsAsString( diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index 0f1745366b7..1f594e551af 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/executable_build_options.h" #include "absl/strings/str_format.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/shape_util.h" namespace xla { @@ -39,6 +40,13 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal( int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; } +DebugOptions* ExecutableBuildOptions::mutable_debug_options() { + if (!has_debug_options()) { + debug_options_ = GetDebugOptionsFromFlags(); + } + return &debug_options_.value(); +} + ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout( const Shape& shape_with_layout) { result_layout_set_ = true; @@ -55,68 +63,10 @@ string ExecutableBuildOptions::ToString() const { if (result_layout_set_) { result_layout = ShapeUtil::HumanStringWithLayout(result_layout_); } - string generate_hlo_graph = "nullopt"; - if (generate_hlo_graph_.has_value()) { - generate_hlo_graph = generate_hlo_graph_.value(); - } return absl::StrFormat( "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, " "generate_hlo_graph=%s}", - device_ordinal_, result_layout, generate_hlo_graph); -} - -ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph( - string regex) { - generate_hlo_graph_ = std::move(regex); - return *this; -} - -const absl::optional& ExecutableBuildOptions::generate_hlo_graph() - const { - return generate_hlo_graph_; -} - -ExecutableBuildOptions& ExecutableBuildOptions::set_dump_optimized_hlo_proto_to( - absl::string_view dirpath) { - dump_optimized_hlo_proto_to_ = string(dirpath); - return *this; -} - -const absl::optional& -ExecutableBuildOptions::dump_optimized_hlo_proto_to() const { - return dump_optimized_hlo_proto_to_; -} - -ExecutableBuildOptions& -ExecutableBuildOptions::set_dump_unoptimized_hlo_proto_to( - absl::string_view dirpath) { - dump_unoptimized_hlo_proto_to_ = string(dirpath); - return *this; -} - -const absl::optional& -ExecutableBuildOptions::dump_unoptimized_hlo_proto_to() const { - return dump_unoptimized_hlo_proto_to_; -} - -ExecutableBuildOptions& ExecutableBuildOptions::set_dump_per_pass_hlo_proto_to( - absl::string_view dirpath) { - dump_per_pass_hlo_proto_to_ = string(dirpath); - return *this; -} - -const absl::optional& -ExecutableBuildOptions::dump_per_pass_hlo_proto_to() const { - return dump_per_pass_hlo_proto_to_; -} - -ExecutableBuildOptions& ExecutableBuildOptions::set_hlo_profile(bool enabled) { - hlo_profile_ = enabled; - return *this; -} - -absl::optional ExecutableBuildOptions::hlo_profile() const { - return hlo_profile_; + device_ordinal_, result_layout, debug_options().xla_generate_hlo_graph()); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/executable_build_options.h 
b/tensorflow/compiler/xla/client/executable_build_options.h index 93334db88bc..a58090253bf 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.h +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -19,7 +19,9 @@ limitations under the License. #include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -44,6 +46,12 @@ class ExecutableBuildOptions { ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout); const Shape* result_layout() const; + // Expose access to the XLA debug options which will be passed to the + // compilation process. + bool has_debug_options() const { return debug_options_.has_value(); } + const DebugOptions& debug_options() const { return *debug_options_; } + DebugOptions* mutable_debug_options(); + // If set, this specifies an allocator that can be used to allocate temporary // space on the device during compilation. For example, the compiler might // want to run various algorithms on the device and pick the fastest one -- it @@ -55,56 +63,16 @@ class ExecutableBuildOptions { DeviceMemoryAllocator* allocator); DeviceMemoryAllocator* device_allocator() const; - // If set, specifies a regexp of HLO graphs to dump (as in DebugOptions). - ExecutableBuildOptions& set_generate_hlo_graph(string regex); - const absl::optional& generate_hlo_graph() const; - - // If set, specifies a dirpath to dump the end-of-optimization-pipeline HLO - // protobuf to (as in DebugOptions). - ExecutableBuildOptions& set_dump_optimized_hlo_proto_to( - absl::string_view dirpath); - const absl::optional& dump_optimized_hlo_proto_to() const; - - // If set, specifies a dirpath to dump the start-of-optimization-pipeline HLO - // protobuf to (as in DebugOptions). - ExecutableBuildOptions& set_dump_unoptimized_hlo_proto_to( - absl::string_view dirpath); - const absl::optional& dump_unoptimized_hlo_proto_to() const; - - // If set, specifies a dirpath to dump the per-pass-in-pipeline HLO protobufs - // to (as in DebugOptions). - ExecutableBuildOptions& set_dump_per_pass_hlo_proto_to( - absl::string_view dirpath); - const absl::optional& dump_per_pass_hlo_proto_to() const; - - // If true, specifies that we should record an HLO profile during execution - // and log it after execution (as in DebugOptions). If nullopt the default is - // used. - ExecutableBuildOptions& set_hlo_profile(bool enabled); - absl::optional hlo_profile() const; - - void add_disabled_hlo_pass(absl::string_view pass_name) { - disabled_hlo_passes_.push_back(std::string(pass_name)); - } - const absl::Span disabled_hlo_passes() const { - return disabled_hlo_passes_; - } - // Returns a string representation of the build options, suitable for // debugging. 
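In place of the removed `set_generate_hlo_graph`, `set_dump_*_hlo_proto_to`, and `set_hlo_profile` setters, callers now reach the same knobs through the embedded `DebugOptions` proto. A sketch of the replacement pattern; the proto setters used here are the standard generated accessors for fields such as `xla_generate_hlo_graph` (the field `ToString()` reads above) and are assumed rather than shown in the diff:

```
#include "tensorflow/compiler/xla/client/executable_build_options.h"

xla::ExecutableBuildOptions MakeBuildOptions() {
  xla::ExecutableBuildOptions options;
  options.set_device_ordinal(0);
  // The first call to mutable_debug_options() seeds the proto from the
  // command-line flags (GetDebugOptionsFromFlags); individual fields can
  // then be overridden through the generated proto accessors.
  options.mutable_debug_options()->set_xla_generate_hlo_graph(".*");
  options.mutable_debug_options()->set_xla_hlo_profile(true);
  return options;
}
```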
string ToString() const; private: - absl::optional hlo_profile_; int device_ordinal_ = -1; Shape result_layout_; bool result_layout_set_ = false; - absl::optional generate_hlo_graph_; - absl::optional dump_optimized_hlo_proto_to_; - absl::optional dump_unoptimized_hlo_proto_to_; - absl::optional dump_per_pass_hlo_proto_to_; + absl::optional debug_options_; DeviceMemoryAllocator* device_allocator_ = nullptr; - std::vector disabled_hlo_passes_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index f833ddcd323..f0f530d7d77 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -104,13 +104,17 @@ xla_test( ) cc_library( - name = "numeric", - srcs = ["numeric.cc"], - hdrs = ["numeric.h"], + name = "matrix", + srcs = ["matrix.cc"], + hdrs = ["matrix.h"], deps = [ ":arithmetic", ":constants", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", "@com_google_absl//absl/types:span", @@ -118,11 +122,12 @@ cc_library( ) xla_test( - name = "numeric_test", - srcs = ["numeric_test.cc"], + name = "matrix_test", + srcs = ["matrix_test.cc"], tags = ["enable_for_xla_interpreter"], deps = [ - ":numeric", + ":matrix", + ":slicing", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", @@ -164,7 +169,6 @@ cc_library( deps = [ ":constants", ":math", - ":numeric", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", @@ -173,13 +177,46 @@ cc_library( ], ) +cc_library( + name = "slicing", + srcs = ["slicing.cc"], + hdrs = ["slicing.h"], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "@com_google_absl//absl/types:span", + ], +) + +xla_test( + name = "slicing_test", + srcs = ["slicing_test.cc"], + tags = ["enable_for_xla_interpreter"], + deps = [ + ":slicing", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + cc_library( name = "sorting", srcs = ["sorting.cc"], hdrs = ["sorting.h"], deps = [ - ":numeric", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", ], @@ -188,10 +225,6 @@ cc_library( xla_test( name = "sorting_test", srcs = ["sorting_test.cc"], - blacklisted_backends = [ - "cpu", - "gpu", - ], tags = ["enable_for_xla_interpreter"], deps = [ ":sorting", diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index 08a887a6e46..36fdda39b41 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -268,17 +268,16 @@ XlaOp Digamma(XlaOp input) 
{ // Implements Banker's rounding: numbers that are equidistant between two // integers are rounded towards even. XlaOp RoundToEven(XlaOp x) { - auto half = xla::ScalarLike(x, 0.5); - auto one = xla::ScalarLike(x, 1.0); - auto two = xla::ScalarLike(x, 2.0); + auto half = ScalarLike(x, 0.5); + auto one = ScalarLike(x, 1.0); + auto two = ScalarLike(x, 2.0); - auto round_val = xla::Floor(x); + auto round_val = Floor(x); auto fraction = x - round_val; - auto nearest_even_int = round_val - two * xla::Floor(half * x); - auto is_odd = xla::Eq(nearest_even_int, one); - return xla::Select(xla::Or(xla::Gt(fraction, half), - xla::And(xla::Eq(fraction, half), is_odd)), - round_val + one, round_val); + auto nearest_even_int = round_val - two * Floor(half * x); + auto is_odd = Eq(nearest_even_int, one); + return Select(Or(Gt(fraction, half), And(Eq(fraction, half), is_odd)), + round_val + one, round_val); } // Trigonometric functions. @@ -320,4 +319,13 @@ XlaOp Cosh(XlaOp x) { return (Exp(x) + Exp(-x)) * ScalarLike(x, 0.5); } XlaOp Sinh(XlaOp x) { return (Exp(x) - Exp(-x)) * ScalarLike(x, 0.5); } +XlaOp MaybeConjugate(XlaOp x, bool conjugate) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> { + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); + auto perform_conj = shape.element_type() == C64 && conjugate; + return perform_conj ? Conj(x) : x; + }); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h index 3f06d04b9ae..17612bf9fdc 100644 --- a/tensorflow/compiler/xla/client/lib/math.h +++ b/tensorflow/compiler/xla/client/lib/math.h @@ -86,6 +86,10 @@ XlaOp Cosh(XlaOp x); // Computes the hyperbolic sine of 'x'. XlaOp Sinh(XlaOp x); +// Applies a complex conjugation operation if `x` is complex and `conjugate` +// is true, otherwise returns its argument. +xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate); + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_ diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc new file mode 100644 index 00000000000..ffd744d1908 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/matrix.cc @@ -0,0 +1,185 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/matrix.h" + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { + +XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, + int64 n) { + auto a = Iota(builder, type, m); + auto b = Iota(builder, type, n); + auto indicator = Eq(a, Broadcast(b, {m}), /*broadcast_dimensions=*/{0}); + return ConvertElementType(indicator, type); +} + +XlaOp GetMatrixDiagonal(XlaOp x) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); + const int64 n_dims = ShapeUtil::Rank(shape); + TF_RET_CHECK(n_dims >= 2); + const int64 m = shape.dimensions(n_dims - 2); + const int64 n = shape.dimensions(n_dims - 1); + absl::Span major_dims = + AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2); + auto a = Iota(builder, U32, n); + auto b = Iota(builder, U32, m); + auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0}); + auto mask = Broadcast(indicator, major_dims); + + // TPUs don't support S64 add reduction at the moment. But fortunately + // OR-reductions work just as well for integers. + XlaComputation reducer = + primitive_util::IsIntegralType(shape.element_type()) + ? CreateScalarOrComputation(shape.element_type(), builder) + : CreateScalarAddComputation(shape.element_type(), builder); + + return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0), + reducer, {m >= n ? n_dims - 2 : n_dims - 1}); + }); +} + +XlaOp Triangle(XlaOp x, bool lower) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); + const int64 n_dims = ShapeUtil::Rank(shape); + TF_RET_CHECK(n_dims >= 2); + const int64 m = shape.dimensions(n_dims - 2); + const int64 n = shape.dimensions(n_dims - 1); + absl::Span major_dims = + AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2); + auto a = Iota(builder, U32, n); + auto b = Iota(builder, U32, m); + XlaOp indicator; + if (lower) { + indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0}); + } else { + indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0}); + } + auto mask = Broadcast(indicator, major_dims); + + return Select(mask, x, Zeros(builder, shape)); + }); +} + +XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); } + +XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); } + +XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x)); + TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y)); + + // Check that both tensors have the same number of dimensions. There must be + // at least two (the batch dimensions can be empty). + if (ShapeUtil::Rank(x_shape) != ShapeUtil::Rank(y_shape)) { + return InvalidArgument( + "Arguments to BatchDot have different ranks: %s vs. 
%s", + ShapeUtil::HumanString(x_shape), ShapeUtil::HumanString(y_shape)); + } + const int ndims = ShapeUtil::Rank(x_shape); + if (ndims < 2) { + return InvalidArgument( + "Arguments to BatchDot must have rank >= 2: got %d", ndims); + } + + // The batch dimensions must be equal and the matrix dimensions must be + // valid. + std::vector batch_dimension_numbers; + for (int i = 0; i < ndims - 2; ++i) { + if (x_shape.dimensions(i) != y_shape.dimensions(i)) { + return InvalidArgument( + "Dimension %d of inputs to BatchDot must be equal: shapes %s vs %s", + i, ShapeUtil::HumanString(x_shape), + ShapeUtil::HumanString(y_shape)); + } + batch_dimension_numbers.push_back(i); + } + + int x_inner_dim = ndims - 1; + int y_inner_dim = ndims - 2; + if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) { + return InvalidArgument( + "Dimensions %d and %d of arguments to BatchDot must be equal: " + "shapes %s vs %s", + x_inner_dim, y_inner_dim, ShapeUtil::HumanString(x_shape), + ShapeUtil::HumanString(y_shape)); + } + + // Check for zero lhs/rhs dim size. + if (ShapeUtil::IsZeroElementArray(x_shape) || + ShapeUtil::IsZeroElementArray(y_shape)) { + std::vector dimensions(batch_dimension_numbers.size()); + for (int i = 0; i < batch_dimension_numbers.size(); ++i) { + dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]); + } + int x_outer_dim = ndims - 2; + int y_outer_dim = ndims - 1; + dimensions.push_back(x_shape.dimensions(x_outer_dim)); + dimensions.push_back(y_shape.dimensions(y_outer_dim)); + return Broadcast( + ConstantLiteral(builder, LiteralUtil::Zero(x_shape.element_type())), + dimensions); + } + + PrecisionConfig precision_proto; + precision_proto.add_operand_precision(precision); + precision_proto.add_operand_precision(precision); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(x_inner_dim); + dot_dnums.add_rhs_contracting_dimensions(y_inner_dim); + for (auto batch_dimension_number : batch_dimension_numbers) { + dot_dnums.add_lhs_batch_dimensions(batch_dimension_number); + dot_dnums.add_rhs_batch_dimensions(batch_dimension_number); + } + + return DotGeneral(x, y, dot_dnums, &precision_proto); + }); +} + +XlaOp TransposeInMinorDims(XlaOp x) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); + const int64 n_dims = ShapeUtil::Rank(shape); + TF_RET_CHECK(n_dims >= 2); + std::vector permutation(n_dims); + std::iota(permutation.begin(), permutation.end(), 0); + std::swap(permutation[n_dims - 1], permutation[n_dims - 2]); + return Transpose(x, permutation); + }); +} + +XlaOp MaybeTransposeInMinorDims(XlaOp x, bool transpose) { + return transpose ? TransposeInMinorDims(x) : x; +} +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/numeric.h b/tensorflow/compiler/xla/client/lib/matrix.h similarity index 56% rename from tensorflow/compiler/xla/client/lib/numeric.h rename to tensorflow/compiler/xla/client/lib/matrix.h index efd8cdc2572..8856f99c7a0 100644 --- a/tensorflow/compiler/xla/client/lib/numeric.h +++ b/tensorflow/compiler/xla/client/lib/matrix.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_ -#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_ +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_ #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/types.h" @@ -22,9 +22,6 @@ limitations under the License. namespace xla { -// Returns a rank 1 tensor of `type` containing values [0, 1, 2, ...]. -XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size); - // Returns an m x n matrix with 1s on the diagonal elements, zeros everywhere // else. XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n); @@ -43,6 +40,34 @@ XlaOp UpperTriangle(XlaOp x); // Get the lower triangle part of the last two dimensions XlaOp LowerTriangle(XlaOp x); +// Multiplies slices of two tensors in batches. + +// Multiplies all slices of `Tensor` `x` and `y` (each slice can be +// viewed as an element of a batch), and arranges the individual results +// in a single output tensor of the same batch size. +// +// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` +// and `[..., r_y, c_y]`. +// +// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where: +// +// r_o = c_x if transpose_x else r_x +// c_o = r_y if transpose_y else c_y +// +// It is computed as: +// +// output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) +xla::XlaOp BatchDot( + xla::XlaOp x, xla::XlaOp y, + xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT); + +// Transposes a stack of matrices `x` by swapping the last two dimensions. +xla::XlaOp TransposeInMinorDims(xla::XlaOp x); + +// Transposes `x` in its minor dimensions if `transpose` is true, otherwise +// returns `x` unchanged. +xla::XlaOp MaybeTransposeInMinorDims(xla::XlaOp x, bool transpose); + } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_ +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_ diff --git a/tensorflow/compiler/xla/client/lib/numeric_test.cc b/tensorflow/compiler/xla/client/lib/matrix_test.cc similarity index 53% rename from tensorflow/compiler/xla/client/lib/numeric_test.cc rename to tensorflow/compiler/xla/client/lib/matrix_test.cc index 7d6aedd4946..0593a7517ac 100644 --- a/tensorflow/compiler/xla/client/lib/numeric_test.cc +++ b/tensorflow/compiler/xla/client/lib/matrix_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" + +#include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -24,13 +26,13 @@ limitations under the License. 
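The new matrix.h documents `BatchDot` as a batched matmul over the two minor dimensions; combined with `TransposeInMinorDims` it covers the cases the removed `transpose_x`/`transpose_y` flags used to express. A sketch mirroring the RowBatchDot test below; the shapes in the comments are illustrative:

```
#include "tensorflow/compiler/xla/client/lib/matrix.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"

// x: [batch, 1, 4], y: [batch, 1, 4]  ->  result: [batch, 1, 1]
xla::XlaOp RowDot(xla::XlaOp x, xla::XlaOp y) {
  // Transposing y in its minor dims gives [batch, 4, 1], so the inner
  // dimensions of the contraction line up.
  return xla::BatchDot(x, xla::TransposeInMinorDims(y));
}
```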
namespace xla { namespace { -class NumericTest : public ClientLibraryTestBase { +class MatrixTest : public ClientLibraryTestBase { protected: template void TestMatrixDiagonal(); }; -XLA_TEST_F(NumericTest, Triangle) { +XLA_TEST_F(MatrixTest, Triangle) { XlaBuilder builder(TestName()); Array3D input(2, 3, 4); input.FillIota(0); @@ -45,7 +47,7 @@ XLA_TEST_F(NumericTest, Triangle) { } template -void NumericTest::TestMatrixDiagonal() { +void MatrixTest::TestMatrixDiagonal() { XlaBuilder builder("GetMatrixDiagonal"); Array3D input(2, 3, 4); input.FillIota(0); @@ -58,11 +60,46 @@ void NumericTest::TestMatrixDiagonal() { ComputeAndCompareR2(&builder, expected, {a_data.get()}); } -XLA_TEST_F(NumericTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal(); } +XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal(); } -XLA_TEST_F(NumericTest, GetMatrixDiagonal_S64) { TestMatrixDiagonal(); } +XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S64) { TestMatrixDiagonal(); } -XLA_TEST_F(NumericTest, GetMatrixDiagonal_F32) { TestMatrixDiagonal(); } +XLA_TEST_F(MatrixTest, GetMatrixDiagonal_F32) { TestMatrixDiagonal(); } +Array3D BatchedAValsFull() { + return {{ + {2, 0, 1, 2}, + {3, 6, 0, 1}, + {4, 7, 9, 0}, + {5, 8, 10, 11}, + }, + { + {16, 24, 8, 12}, + {24, 61, 82, 48}, + {8, 82, 456, 106}, + {12, 48, 106, 62}, + }}; +} + +XLA_TEST_F(MatrixTest, RowBatchDot) { + XlaBuilder builder(TestName()); + + int n = 4; + + XlaOp a, row, index; + auto a_data = + CreateR3Parameter(BatchedAValsFull(), 0, "a", &builder, &a); + auto row_data = CreateR3Parameter({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1, + "row", &builder, &row); + // Select {{3, 6, 0, 1}, {24, 61, 82, 48}} out of BatchedAValsFull(). + auto index_data = CreateR0Parameter(1, 2, "index", &builder, &index); + + auto l_index = DynamicSliceInMinorDims( + a, {index, ConstantR0(&builder, 0)}, {1, n}); + BatchDot(l_index, TransposeInMinorDims(row)); + + ComputeAndCompareR3(&builder, {{{33}}, {{292}}}, + {a_data.get(), row_data.get(), index_data.get()}); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/numeric.cc b/tensorflow/compiler/xla/client/lib/numeric.cc deleted file mode 100644 index 377654220b5..00000000000 --- a/tensorflow/compiler/xla/client/lib/numeric.cc +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include -#include - -#include "absl/types/span.h" -#include "tensorflow/compiler/xla/client/lib/arithmetic.h" -#include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" - -namespace xla { - -XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, - int64 n) { - auto a = Iota(builder, type, m); - auto b = Iota(builder, type, n); - auto indicator = Eq(a, Broadcast(b, {m}), /*broadcast_dimensions=*/{0}); - return ConvertElementType(indicator, type); -} - -XlaOp GetMatrixDiagonal(XlaOp x) { - XlaBuilder* builder = x.builder(); - return builder->ReportErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); - const int64 n_dims = ShapeUtil::Rank(shape); - TF_RET_CHECK(n_dims >= 2); - const int64 m = shape.dimensions(n_dims - 2); - const int64 n = shape.dimensions(n_dims - 1); - absl::Span major_dims = - AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2); - auto a = Iota(builder, U32, n); - auto b = Iota(builder, U32, m); - auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0}); - auto mask = Broadcast(indicator, major_dims); - - // TPUs don't support S64 add reduction at the moment. But fortunately - // OR-reductions work just as well for integers. - XlaComputation reducer = - primitive_util::IsIntegralType(shape.element_type()) - ? CreateScalarOrComputation(shape.element_type(), builder) - : CreateScalarAddComputation(shape.element_type(), builder); - - return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0), - reducer, {m >= n ? n_dims - 2 : n_dims - 1}); - }); -} - -XlaOp Triangle(XlaOp x, bool lower) { - XlaBuilder* builder = x.builder(); - return builder->ReportErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); - const int64 n_dims = ShapeUtil::Rank(shape); - TF_RET_CHECK(n_dims >= 2); - const int64 m = shape.dimensions(n_dims - 2); - const int64 n = shape.dimensions(n_dims - 1); - absl::Span major_dims = - AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2); - auto a = Iota(builder, U32, n); - auto b = Iota(builder, U32, m); - xla::XlaOp indicator; - if (lower) { - indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0}); - } else { - indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0}); - } - auto mask = Broadcast(indicator, major_dims); - - return Select(mask, x, Zeros(builder, shape)); - }); -} - -XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); } - -XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); } - -} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc index c6f68c8ee2f..85b9e1827dc 100644 --- a/tensorflow/compiler/xla/client/lib/prng.cc +++ b/tensorflow/compiler/xla/client/lib/prng.cc @@ -18,7 +18,6 @@ limitations under the License. 
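With numeric.cc deleted, `IdentityMatrix`, the triangle helpers, and `GetMatrixDiagonal` live in matrix.h, and the dropped `Iota` declaration is assumed to be served directly by the builder API. A small sketch of the relocated helpers; the 4x4 size is arbitrary:

```
#include "tensorflow/compiler/xla/client/lib/matrix.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"

xla::XlaOp BuildMatrixExample(xla::XlaBuilder* b) {
  // 4x4 identity, then keep only its lower-triangular part (a no-op here).
  xla::XlaOp eye = xla::IdentityMatrix(b, xla::F32, 4, 4);
  xla::XlaOp lower = xla::LowerTriangle(eye);
  // The main diagonal of the masked matrix: a vector of four 1.0f values.
  return xla::GetMatrixDiagonal(lower);
}
```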
#include "absl/base/casts.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/math.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/util.h" diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc new file mode 100644 index 00000000000..f8c7df3ff51 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/slicing.cc @@ -0,0 +1,134 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/slicing.h" + +namespace xla { + +XlaOp SliceInMinorDims(XlaOp x, absl::Span start, + absl::Span end) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_RET_CHECK(start.size() == end.size()); + int64 n_minor_dims = start.size(); + + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); + + const int64 n_dims = ShapeUtil::Rank(shape); + TF_RET_CHECK(n_minor_dims <= n_dims); + auto major_dims = AsInt64Slice(shape.dimensions()) + .subspan( + /*pos=*/0, + /*len=*/n_dims - n_minor_dims); + + // Prepends 0s in the major dim + std::vector padded_start(n_dims, 0); + std::copy(start.begin(), start.end(), + padded_start.begin() + major_dims.size()); + + // Prepends the shape of the major dims. + std::vector padded_end(n_dims); + std::copy(major_dims.begin(), major_dims.end(), padded_end.begin()); + std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size()); + + std::vector strides(n_dims, 1); + return Slice(x, padded_start, padded_end, strides); + }); +} + +XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span start) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + // TODO(phawkins): make int64 work on all backends, remove the int32 cast. 
+ std::vector start_as_int32(start.begin(), start.end()); + auto start_constant = ConstantR1(builder, start_as_int32); + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); + const int64 n_dims = ShapeUtil::Rank(shape); + TF_ASSIGN_OR_RETURN(Shape start_constant_shape, + builder->GetShape(start_constant)); + const int64 start_length = + ShapeUtil::GetDimension(start_constant_shape, -1); + TF_RET_CHECK(start_length == n_dims); + return DynamicUpdateSlice(x, update, start_constant); + }); +} + +XlaOp UpdateSliceInMinorDims(XlaOp x, XlaOp update, + absl::Span start) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); + const int64 n_dims = ShapeUtil::Rank(shape); + const int64 n_minor_dims = start.size(); + TF_RET_CHECK(n_minor_dims <= n_dims); + std::vector padded_start(n_dims, 0); + std::copy(start.begin(), start.end(), + padded_start.begin() + (n_dims - n_minor_dims)); + return UpdateSlice(x, update, padded_start); + }); +} + +namespace { + +std::vector ConcatVectors(absl::Span xs, + absl::Span ys) { + std::vector output(xs.size() + ys.size()); + std::copy(xs.begin(), xs.end(), output.begin()); + std::copy(ys.begin(), ys.end(), output.begin() + xs.size()); + return output; +} + +XlaOp PrependZerosInMajorDims(XlaOp x, absl::Span starts) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); + const int64 n_dims = ShapeUtil::Rank(shape); + auto zero = Reshape(ConstantR0(builder, 0), {1}); + std::vector padded_starts(n_dims, zero); + for (int i = 0; i < starts.size(); ++i) { + padded_starts[n_dims - starts.size() + i] = Reshape(starts[i], {1}); + } + return ConcatInDim(builder, padded_starts, 0); + }); +} + +} // namespace + +XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span starts, + absl::Span sizes) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); + const int64 n_dims = ShapeUtil::Rank(shape); + int64 n_minor_dims = starts.size(); + TF_RET_CHECK(n_minor_dims == sizes.size()); + TF_RET_CHECK(n_minor_dims <= n_dims); + auto major_dims = AsInt64Slice(shape.dimensions()) + .subspan( + /*pos=*/0, + /*len=*/n_dims - sizes.size()); + auto padded_starts = PrependZerosInMajorDims(x, starts); + auto padded_sizes = ConcatVectors(major_dims, sizes); + return DynamicSlice(x, padded_starts, padded_sizes); + }); +} + +XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update, + absl::Span starts) { + auto padded_starts = PrependZerosInMajorDims(x, starts); + return DynamicUpdateSlice(x, update, padded_starts); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h new file mode 100644 index 00000000000..6c482a38b54 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/slicing.h @@ -0,0 +1,48 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/types.h" + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_ + +namespace xla { + +// Updates a slice of 'x', i.e., +// x[start[0], ..., start[n]] = update +XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span start); + +// Performs a slice in the minor dimensions of a tensor. +// x[..., start[0]:end[0], ..., start[n]:end[n]] +XlaOp SliceInMinorDims(XlaOp x, absl::Span start, + absl::Span end); + +// Updates a slice of 'x', where 'start' contains a list of minor dimensions: +// x[..., start[0]:..., ..., start[n]:...] = update +XlaOp UpdateSliceInMinorDims(XlaOp x, XlaOp update, + absl::Span start); + +// Performs a dynamic slice in the minor dimensions of a tensor. +XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span starts, + absl::Span sizes); + +XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update, + absl::Span starts); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_ diff --git a/tensorflow/compiler/tf2xla/lib/util_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc similarity index 67% rename from tensorflow/compiler/tf2xla/lib/util_test.cc rename to tensorflow/compiler/xla/client/lib/slicing_test.cc index 442fe92c34c..8d362119e01 100644 --- a/tensorflow/compiler/tf2xla/lib/util_test.cc +++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc @@ -13,28 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/tf2xla/lib/util.h" +#include "tensorflow/compiler/xla/client/lib/slicing.h" -#include -#include -#include - -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" -#include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" -#include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/status_test_util.h" -namespace tensorflow { +namespace xla { namespace { -using UtilTest = xla::ClientLibraryTestBase; -using UtilLeftLookingTest = xla::ClientLibraryTestBase; +using SlicingTest = xla::ClientLibraryTestBase; xla::Array2D BValsRight() { return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}; @@ -63,7 +54,7 @@ xla::Array3D BatchedAValsFull() { }}; } -XLA_TEST_F(UtilTest, Simple2dLookup) { +XLA_TEST_F(SlicingTest, Simple2dLookup) { xla::XlaBuilder builder(TestName()); xla::XlaOp a, x, y; @@ -77,7 +68,7 @@ XLA_TEST_F(UtilTest, Simple2dLookup) { xla::ErrorSpec(1e-2, 1e-2)); } -XLA_TEST_F(UtilTest, Simple3dLookup) { +XLA_TEST_F(SlicingTest, Simple3dLookup) { xla::XlaBuilder builder(TestName()); xla::XlaOp a, index; @@ -92,7 +83,7 @@ XLA_TEST_F(UtilTest, Simple3dLookup) { {a_data.get(), index_data.get()}); } -XLA_TEST_F(UtilTest, SimpleSliceUpdate) { +XLA_TEST_F(SlicingTest, SimpleSliceUpdate) { xla::XlaBuilder builder(TestName()); xla::XlaOp a, b, x, y; @@ -111,26 +102,5 @@ XLA_TEST_F(UtilTest, SimpleSliceUpdate) { {a_data.get(), b_data.get(), x_data.get(), y_data.get()}); } -XLA_TEST_F(UtilTest, RowBatchDot) { - xla::XlaBuilder builder(TestName()); - - int n = 4; - - xla::XlaOp a, row, index; - auto a_data = - CreateR3Parameter(BatchedAValsFull(), 0, "a", &builder, &a); - auto row_data = CreateR3Parameter({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1, - "row", &builder, &row); - // Select {{3, 6, 0, 1}, {24, 61, 82, 48}} out of BatchedAValsFull(). - auto index_data = CreateR0Parameter(1, 2, "index", &builder, &index); - - auto l_index = DynamicSliceInMinorDims( - a, {index, xla::ConstantR0(&builder, 0)}, {1, n}); - BatchDot(l_index, row, /*transpose_x=*/false, /*transpose_y=*/true); - - ComputeAndCompareR3(&builder, {{{33}}, {{292}}}, - {a_data.get(), row_data.get(), index_data.get()}); -} - } // namespace -} // namespace tensorflow +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc index 0475fd9c94f..e8553a08bb0 100644 --- a/tensorflow/compiler/xla/client/lib/sorting.cc +++ b/tensorflow/compiler/xla/client/lib/sorting.cc @@ -14,7 +14,9 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/client/lib/sorting.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" namespace xla { @@ -23,13 +25,12 @@ XlaOp TopK(XlaOp input, int64 k) { return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); int last_dim = input_shape.dimensions_size() - 1; - int last_dim_size = input_shape.dimensions(last_dim); - XlaOp iota_s32 = Iota(builder, S32, last_dim_size); + Shape iota_shape = + ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions())); + XlaOp iota_s32 = Iota(builder, iota_shape, last_dim); auto input_dims = input_shape.dimensions(); - std::vector broadcast_dims(input_dims.begin(), input_dims.end() - 1); - XlaOp broadcast_s32 = Broadcast(iota_s32, broadcast_dims); - XlaOp sort_result = Sort(Neg(input), {broadcast_s32}); + XlaOp sort_result = Sort(Neg(input), {iota_s32}); std::vector start_indices(input_shape.dimensions_size(), 0); std::vector limit_indices(input_dims.begin(), input_dims.end()); limit_indices[last_dim] = k; diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc index fef98c99230..27ff36c7491 100644 --- a/tensorflow/compiler/xla/client/lib/sorting_test.cc +++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc @@ -14,6 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/client/lib/sorting.h" + +#include + #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -41,6 +44,28 @@ XLA_TEST_F(SortingTest, TopK3From8Indices) { ComputeAndCompareR1(&builder, {0, 1, 2}, {}); } +// TODO(b/119930279): enable this test. +XLA_TEST_F(SortingTest, DISABLED_TopKFullSortMinInt) { + XlaBuilder builder(TestName()); + auto x_rev = ConstantR1(&builder, {std::numeric_limits::min(), + std::numeric_limits::min() + 1, + std::numeric_limits::max()}); + xla::GetTupleElement(xla::TopK(x_rev, 3), 1); + ComputeAndCompareR1(&builder, {2, 1, 0}, {}); +} + +XLA_TEST_F(SortingTest, NOT_TopKFullSortMinInt) { + XlaBuilder builder(TestName()); + auto x_rev = ConstantR1(&builder, {std::numeric_limits::min(), + std::numeric_limits::min() + 1, + std::numeric_limits::max()}); + xla::GetTupleElement(xla::TopK(x_rev, 3), 1); + // TopK currently negates the keys, which doesn't work correctly for + // std::numeric_limits::min(). Therefore, it will sort this key to the + // front instead of to the back. 
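The comment above is the crux of why the first variant stays disabled: TopK sorts by negated keys, and the most negative int32 has no positive counterpart. A standalone sketch of the overflow, illustrative only and not part of the patch:

```
#include <cstdint>
#include <iostream>
#include <limits>

int main() {
  const int32_t min32 = std::numeric_limits<int32_t>::min();
  const int32_t max32 = std::numeric_limits<int32_t>::max();
  // Negating min32 directly would overflow (undefined behavior for signed
  // integers), so widen to int64 to show the value the negation would need.
  const int64_t needed = -static_cast<int64_t>(min32);  // 2147483648
  std::cout << "int32 min = " << min32 << ", int32 max = " << max32 << "\n"
            << "-(int32 min) = " << needed << " does not fit in int32, so\n"
            << "the negated-key comparison mis-orders this element.\n";
  return 0;
}
```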
+ ComputeAndCompareR1(&builder, {0, 2, 1}, {}); +} + XLA_TEST_F(SortingTest, TopKFullSort) { XlaBuilder builder(TestName()); const int kSize = 16; @@ -56,5 +81,13 @@ XLA_TEST_F(SortingTest, TopKFullSort) { ComputeAndCompareR1(&builder, inputs, {}); } +XLA_TEST_F(SortingTest, TopKFullSortWithDuplicates) { + XlaBuilder builder(TestName()); + XlaOp a; + auto a_data = CreateR1Parameter({1, 1, 2, 2, 1}, 0, "a", &builder, &a); + xla::GetTupleElement(xla::TopK(a, 5), 1); + ComputeAndCompareR1(&builder, {2, 3, 0, 1, 4}, {a_data.get()}); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc index a44681f5862..a95bbf2c8c8 100644 --- a/tensorflow/compiler/xla/client/lib/testing.cc +++ b/tensorflow/compiler/xla/client/lib/testing.cc @@ -66,7 +66,7 @@ std::unique_ptr MakeFakeDataViaDeviceOrDie(const Shape& shape, XlaComputation computation = b.Build().ConsumeValueOrDie(); auto execution_options = CreateDefaultExecutionOptions(); - *execution_options.mutable_shape_with_output_layout() = shape; + *execution_options.mutable_shape_with_output_layout() = shape.ToProto(); return client->Execute(computation, /*arguments=*/{}, &execution_options) .ConsumeValueOrDie(); } @@ -98,8 +98,8 @@ std::vector> MakeFakeArgumentsOrDie( auto program_shape = computation.proto().host_program_shape(); std::vector> results; - for (const Shape& shape : program_shape.parameters()) { - results.push_back(MakeFakeDataOrDie(shape, client)); + for (const ShapeProto& shape : program_shape.parameters()) { + results.push_back(MakeFakeDataOrDie(Shape(shape), client)); } return results; } diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index f96b6c9c261..aaa5d6989ee 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -310,4 +310,28 @@ StatusOr LocalClient::ReplicaNumberToDeviceOrdinal(int replica_number) { return local_service_->ReplicaNumberToDeviceOrdinal(replica_number); } +StatusOr LocalClient::TransferToLocalServer( + const ::xla::BorrowingLiteral& literal, int device_oridinal) { + const ::xla::Shape& shape = literal.shape(); + + TF_ASSIGN_OR_RETURN( + ::xla::ScopedShapedBuffer shaped_buffer, + backend().transfer_manager()->AllocateScopedShapedBuffer( + shape, backend().memory_allocator(), device_oridinal)); + TF_ASSIGN_OR_RETURN(auto stream, + mutable_backend()->BorrowStream(device_oridinal)); + TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice( + stream.get(), literal, shaped_buffer)); + std::vector<::xla::ScopedShapedBuffer> replicated_buffer; + replicated_buffer.emplace_back(std::move(shaped_buffer)); + ::xla::TransferToServerResponse result; + TF_ASSIGN_OR_RETURN(*result.mutable_data(), + local_service_->RegisterReplicatedBuffers( + std::move(replicated_buffer), + absl::StrCat("TransferToServer literal of shape ", + ::xla::ShapeUtil::HumanString(shape)))); + + return result; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index e49451ca970..ddb36680e8b 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -129,6 +129,10 @@ class LocalClient : public Client { const Literal& literal, int device_ordinal, DeviceMemoryAllocator* allocator = nullptr); + // Transfer the BorrowingLiteral to the device with the given ordinal. 
+ StatusOr TransferToLocalServer( + const ::xla::BorrowingLiteral& literal, int device_oridinal); + // Copy the data from the device contained in the given ShapedBuffer and // return as a Literal. StatusOr ShapedBufferToLiteral(const ShapedBuffer& shaped_buffer); diff --git a/tensorflow/compiler/xla/client/sharding_builder.cc b/tensorflow/compiler/xla/client/sharding_builder.cc index 176802b33ef..fb9ea6ec3fc 100644 --- a/tensorflow/compiler/xla/client/sharding_builder.cc +++ b/tensorflow/compiler/xla/client/sharding_builder.cc @@ -36,7 +36,7 @@ OpSharding Tile(const Shape& tile_shape, const TileAssignment& tile_assignment) { OpSharding result; result.set_type(OpSharding::Type::OpSharding_Type_OTHER); - *result.mutable_tile_shape() = tile_shape; + *result.mutable_tile_shape() = tile_shape.ToProto(); for (int64 dim : tile_assignment.dimensions()) { result.add_tile_assignment_dimensions(dim); } @@ -52,7 +52,7 @@ OpSharding Tile1D(const Shape& tile_shape, int64 num_tiles) { CHECK_EQ(ShapeUtil::Rank(tile_shape), 1); std::vector dimensions(1, num_tiles); - *result.mutable_tile_shape() = tile_shape; + *result.mutable_tile_shape() = tile_shape.ToProto(); auto& tile_dimension = (*result.mutable_tile_shape()->mutable_dimensions())[0]; tile_dimension = CeilOfRatio(static_cast(tile_dimension), num_tiles); diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 0a587725d20..60df2ec3959 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -102,7 +102,7 @@ StatusOr XlaBuilder::GetShape(const XlaOp& op) const { TF_RETURN_IF_ERROR(first_error_); TF_ASSIGN_OR_RETURN(auto instr, LookUpInstruction(op)); - return instr->shape(); + return Shape(instr->shape()); } StatusOr> XlaBuilder::GetOperandShapes( @@ -155,7 +155,7 @@ StatusOr XlaBuilder::GetProgramShape(int64 root_id) const { ProgramShape program_shape; - *program_shape.mutable_result() = root_proto->shape(); + *program_shape.mutable_result() = Shape(root_proto->shape()); // Check that the parameter numbers are continuous from 0, and add parameter // shapes and names to the program shape. 
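The hunks here and throughout the rest of xla_builder.cc follow one mechanical pattern: shape inference results are kept as `xla::Shape`, converted with `ToProto()` when written into an `HloInstructionProto`, and wrapped back into `Shape` when read. A condensed sketch of that round trip (the conversion calls appear verbatim in the hunks; the wrapper function and header choices here are illustrative only):

```
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/shape_util.h"

namespace xla {

// Round-trips a shape through an instruction proto, mirroring the
// Shape <-> ShapeProto conversions introduced in this file.
Shape RoundTripThroughInstructionProto(const Shape& shape) {
  HloInstructionProto instr;
  *instr.mutable_shape() = shape.ToProto();  // Shape -> ShapeProto on write.
  return Shape(instr.shape());               // ShapeProto -> Shape on read.
}

}  // namespace xla
```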
@@ -172,7 +172,7 @@ StatusOr XlaBuilder::GetProgramShape(int64 root_id) const { const int64 index = instr.parameter_number(); TF_RET_CHECK(index >= 0 && index < param_count) << "invalid parameter number: " << index; - *program_shape.mutable_parameters(index) = instr.shape(); + *program_shape.mutable_parameters(index) = Shape(instr.shape()); *program_shape.mutable_parameter_names(index) = instr.name(); } } @@ -239,6 +239,19 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle, visited->insert(op_handle); } +Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num, + ShapeIndex dynamic_size_param_index, + int64 target_param_num, + ShapeIndex target_param_index, + int64 target_dim_num) { + TF_RETURN_IF_ERROR(dynamic_parameter_binding_.Bind( + DynamicParameterBinding::DynamicParameter{dynamic_size_param_num, + dynamic_size_param_index}, + DynamicParameterBinding::DynamicDimension{ + target_param_num, target_param_index, target_dim_num})); + return Status::OK(); +} + XlaComputation XlaBuilder::BuildAndNoteError() { DCHECK(parent_builder_ != nullptr); auto build_status = Build(); @@ -275,7 +288,8 @@ StatusOr XlaBuilder::Build(int64 root_id) { HloComputationProto entry; SetProtoIdAndName(&entry, name_, kNameSeparator, GetNextId()); - TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(), GetProgramShape(root_id)); + TF_ASSIGN_OR_RETURN(ProgramShape program_shape, GetProgramShape(root_id)); + *entry.mutable_program_shape() = program_shape.ToProto(); entry.set_root_id(root_id); for (auto& instruction : instructions_) { @@ -297,6 +311,9 @@ StatusOr XlaBuilder::Build(int64 root_id) { } module->add_computations()->Swap(&entry); + *(module->mutable_dynamic_parameter_binding()) = + dynamic_parameter_binding_.ToProto(); + // Clear data held by this builder. this->instructions_.clear(); this->handle_to_index_.clear(); @@ -312,7 +329,7 @@ StatusOr XlaBuilder::InDimBroadcast( TF_RETURN_IF_ERROR(first_error_); HloInstructionProto instr; - *instr.mutable_shape() = shape; + *instr.mutable_shape() = shape.ToProto(); for (int64 dim : broadcast_dimensions) { instr.add_dimensions(dim); } @@ -363,8 +380,9 @@ XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferUnaryOpShape(unop, operand_shape)); + *instr.mutable_shape() = shape.ToProto(); return AddInstruction(std::move(instr), unop, {operand}); }); } @@ -375,9 +393,10 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs, HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferBinaryOpShape( binop, lhs_shape, rhs_shape, broadcast_dimensions)); + *instr.mutable_shape() = shape.ToProto(); const int64 lhs_rank = ShapeUtil::Rank(lhs_shape); const int64 rhs_rank = ShapeUtil::Rank(rhs_shape); @@ -391,7 +410,7 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs, const Shape& from_shape = should_broadcast_lhs ? 
lhs_shape : rhs_shape; std::vector to_size; - for (int64 size : instr.shape().dimensions()) { + for (int64 size : shape.dimensions()) { to_size.push_back(size); } for (int64 from_dim = 0; from_dim < ShapeUtil::Rank(from_shape); @@ -411,14 +430,14 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs, } TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, GetShape(updated_lhs)); - if (!ShapeUtil::SameDimensions(instr.shape(), updated_lhs_shape)) { + if (!ShapeUtil::SameDimensions(shape, updated_lhs_shape)) { TF_ASSIGN_OR_RETURN(updated_lhs, - AddBroadcastSequence(instr.shape(), updated_lhs)); + AddBroadcastSequence(shape, updated_lhs)); } TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, GetShape(updated_rhs)); - if (!ShapeUtil::SameDimensions(instr.shape(), updated_rhs_shape)) { + if (!ShapeUtil::SameDimensions(shape, updated_rhs_shape)) { TF_ASSIGN_OR_RETURN(updated_rhs, - AddBroadcastSequence(instr.shape(), updated_rhs)); + AddBroadcastSequence(shape, updated_rhs)); } return AddInstruction(std::move(instr), binop, {updated_lhs, updated_rhs}); @@ -432,30 +451,28 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs, TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, GetShape(ehs)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferTernaryOpShape( - triop, lhs_shape, rhs_shape, ehs_shape)); + TF_ASSIGN_OR_RETURN( + Shape shape, ShapeInference::InferTernaryOpShape(triop, lhs_shape, + rhs_shape, ehs_shape)); + *instr.mutable_shape() = shape.ToProto(); XlaOp updated_lhs = lhs; XlaOp updated_rhs = rhs; XlaOp updated_ehs = ehs; - if (!ShapeUtil::IsTuple(instr.shape())) { + if (!ShapeUtil::IsTuple(shape)) { if (!ShapeUtil::IsTuple(lhs_shape) && - !ShapeUtil::SameDimensions(instr.shape(), lhs_shape)) { + !ShapeUtil::SameDimensions(shape, lhs_shape)) { // lhs is being implicitly broadcasted. Change to explicit. - TF_ASSIGN_OR_RETURN(updated_lhs, - AddBroadcastSequence(instr.shape(), lhs)); + TF_ASSIGN_OR_RETURN(updated_lhs, AddBroadcastSequence(shape, lhs)); } if (!ShapeUtil::IsTuple(rhs_shape) && - !ShapeUtil::SameDimensions(instr.shape(), rhs_shape)) { + !ShapeUtil::SameDimensions(shape, rhs_shape)) { // rhs is being implicitly broadcasted. Change to explicit. - TF_ASSIGN_OR_RETURN(updated_rhs, - AddBroadcastSequence(instr.shape(), rhs)); + TF_ASSIGN_OR_RETURN(updated_rhs, AddBroadcastSequence(shape, rhs)); } if (!ShapeUtil::IsTuple(ehs_shape) && - !ShapeUtil::SameDimensions(instr.shape(), ehs_shape)) { + !ShapeUtil::SameDimensions(shape, ehs_shape)) { // ehs is being implicitly broadcasted. Change to explicit. 
- TF_ASSIGN_OR_RETURN(updated_ehs, - AddBroadcastSequence(instr.shape(), ehs)); + TF_ASSIGN_OR_RETURN(updated_ehs, AddBroadcastSequence(shape, ehs)); } } return AddInstruction(std::move(instr), triop, @@ -476,7 +493,7 @@ XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs, XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - *instr.mutable_shape() = literal.shape(); + *instr.mutable_shape() = literal.shape().ToProto(); *instr.mutable_literal() = literal.ToProto(); return AddInstruction(std::move(instr), HloOpcode::kConstant); }); @@ -485,7 +502,7 @@ XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) { XlaOp XlaBuilder::Iota(const Shape& shape, int64 iota_dimension) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - *instr.mutable_shape() = shape; + *instr.mutable_shape() = shape.ToProto(); instr.add_dimensions(iota_dimension); return AddInstruction(std::move(instr), HloOpcode::kIota); }); @@ -505,10 +522,10 @@ XlaOp XlaBuilder::Call(const XlaComputation& computation, [](const Shape& shape) { return &shape; }); TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape, computation.GetProgramShape()); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferCallShape(operand_shape_ptrs, - /*to_apply=*/called_program_shape)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCallShape( + operand_shape_ptrs, + /*to_apply=*/called_program_shape)); + *instr.mutable_shape() = shape.ToProto(); AddCalledComputation(computation, &instr); @@ -526,7 +543,7 @@ XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape, } instr.set_parameter_number(parameter_number); instr.set_name(name); - *instr.mutable_shape() = shape; + *instr.mutable_shape() = shape.ToProto(); return AddInstruction(std::move(instr), HloOpcode::kParameter); }); } @@ -556,27 +573,35 @@ XlaOp XlaBuilder::Broadcast(const XlaOp& operand, } XlaOp XlaBuilder::BroadcastInDim( - const XlaOp& operand, const Shape& shape, + const XlaOp& operand, const absl::Span out_dim_size, const absl::Span broadcast_dimensions) { return ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_RETURN_IF_ERROR(ShapeInference::InferBroadcastShape(operand_shape, shape, - broadcast_dimensions) + // Output shape, in the case of degenerate broadcast, the out_dim_size is + // not necessarily the same as the dimension sizes of the output shape. + const auto& output_shape = + ShapeUtil::MakeShape(operand_shape.element_type(), out_dim_size); + + TF_RETURN_IF_ERROR(ShapeInference::InferBroadcastShape( + operand_shape, output_shape, broadcast_dimensions) .status()); - std::vector in_dim_size(ShapeUtil::Rank(shape)); - absl::c_copy(shape.dimensions(), in_dim_size.begin()); + std::vector in_dim_size(out_dim_size.begin(), out_dim_size.end()); for (int i = 0; i < broadcast_dimensions.size(); i++) { in_dim_size[broadcast_dimensions[i]] = operand_shape.dimensions(i); } const auto& in_dim_shape = - ShapeUtil::MakeShape(shape.element_type(), in_dim_size); + ShapeUtil::MakeShape(operand_shape.element_type(), in_dim_size); TF_ASSIGN_OR_RETURN( XlaOp in_dim_broadcast, InDimBroadcast(in_dim_shape, operand, broadcast_dimensions)); - if (ShapeUtil::Equal(in_dim_shape, shape)) { + + // If broadcast is not degenerate, return broadcasted result. 
+ if (ShapeUtil::Equal(in_dim_shape, output_shape)) { return in_dim_broadcast; } - return AddBroadcastSequence(shape, in_dim_broadcast); + + // Otherwise handle degenerate broadcast case. + return AddBroadcastSequence(output_shape, in_dim_broadcast); }); } @@ -584,7 +609,7 @@ StatusOr XlaBuilder::Reshape(const Shape& shape, const XlaOp& operand) { TF_RETURN_IF_ERROR(first_error_); HloInstructionProto instr; - *instr.mutable_shape() = shape; + *instr.mutable_shape() = shape.ToProto(); return AddInstruction(std::move(instr), HloOpcode::kReshape, {operand}); } @@ -596,9 +621,9 @@ XlaOp XlaBuilder::Slice(const XlaOp& operand, HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferSliceShape(operand_shape, start_indices, - limit_indices, strides)); + Shape shape, ShapeInference::InferSliceShape( + operand_shape, start_indices, limit_indices, strides)); + *instr.mutable_shape() = shape.ToProto(); for (int i = 0; i < start_indices.size(); i++) { auto* slice_config = instr.add_slice_dimensions(); slice_config->set_start(start_indices[i]); @@ -633,9 +658,10 @@ XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices, TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape, GetShape(start_indices)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDynamicSliceShape( operand_shape, start_indices_shape, slice_sizes)); + *instr.mutable_shape() = shape.ToProto(); for (int64 size : slice_sizes) { instr.add_dynamic_slice_sizes(size); @@ -655,9 +681,10 @@ XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update, TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update)); TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape, GetShape(start_indices)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDynamicUpdateSliceShape( operand_shape, update_shape, start_indices_shape)); + *instr.mutable_shape() = shape.ToProto(); return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, {operand, update, start_indices}); @@ -673,9 +700,9 @@ XlaOp XlaBuilder::ConcatInDim(absl::Span operands, TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands)); absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), [](const Shape& shape) { return &shape; }); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferConcatOpShape(operand_shape_ptrs, dimension)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConcatOpShape( + operand_shape_ptrs, dimension)); + *instr.mutable_shape() = shape.ToProto(); instr.add_dimensions(dimension); @@ -692,10 +719,9 @@ XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value, TF_ASSIGN_OR_RETURN(const Shape& padding_value_shape, GetShape(padding_value)); TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferPadShape(operand_shape, padding_value_shape, - padding_config)); - + Shape shape, ShapeInference::InferPadShape( + operand_shape, padding_value_shape, padding_config)); + *instr.mutable_shape() = shape.ToProto(); *instr.mutable_padding_config() = padding_config; return AddInstruction(std::move(instr), HloOpcode::kPad, @@ -708,7 +734,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand, absl::Span new_sizes) { return ReportErrorOrReturn([&]() -> StatusOr { 
TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& shape, + TF_ASSIGN_OR_RETURN(const Shape shape, ShapeInference::InferReshapeShape( operand_shape, dimensions, new_sizes)); XlaOp transposed = IsIdentityPermutation(dimensions) @@ -721,7 +747,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand, XlaOp XlaBuilder::Reshape(const XlaOp& operand, absl::Span new_sizes) { return ReportErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(auto shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(Shape shape, GetShape(operand)); std::vector dimensions(shape.dimensions_size()); std::iota(dimensions.begin(), dimensions.end(), 0); return Reshape(operand, dimensions, new_sizes); @@ -771,7 +797,7 @@ XlaOp XlaBuilder::Collapse(const XlaOp& operand, void XlaBuilder::Trace(const string& tag, const XlaOp& operand) { ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - *instr.mutable_shape() = ShapeUtil::MakeNil(); + *instr.mutable_shape() = ShapeUtil::MakeNil().ToProto(); *instr.mutable_literal() = LiteralUtil::CreateR1U8(tag).ToProto(); return AddInstruction(std::move(instr), HloOpcode::kTrace, {operand}); }); @@ -797,9 +823,10 @@ XlaOp XlaBuilder::Tuple(absl::Span elements) { TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements)); absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), [](const Shape& shape) { return &shape; }); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(const Shape shape, ShapeInference::InferVariadicOpShape( HloOpcode::kTuple, operand_shape_ptrs)); + *instr.mutable_shape() = shape.ToProto(); return AddInstruction(std::move(instr), HloOpcode::kTuple, elements); }); } @@ -814,7 +841,7 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) { ShapeUtil::HumanString(tuple_shape)); } *instr.mutable_shape() = - ShapeUtil::GetTupleElementShape(tuple_shape, index); + ShapeUtil::GetTupleElementShape(tuple_shape, index).ToProto(); instr.set_tuple_index(index); @@ -873,9 +900,10 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs, HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dimension_numbers)); + *instr.mutable_shape() = shape.ToProto(); *instr.mutable_dot_dimension_numbers() = dimension_numbers; if (precision_config != nullptr) { *instr.mutable_precision_config() = *precision_config; @@ -1017,10 +1045,11 @@ XlaOp XlaBuilder::ConvGeneralDilated( MakeWindow(window_dimensions, window_strides, padding, lhs_dilation, rhs_dilation)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvolveShape( lhs_shape, rhs_shape, feature_group_count, instr.window(), dimension_numbers)); + *instr.mutable_shape() = shape.ToProto(); *instr.mutable_convolution_dimension_numbers() = dimension_numbers; instr.set_feature_group_count(feature_group_count); @@ -1093,10 +1122,9 @@ XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type, return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferFftShape(operand_shape, fft_type, fft_length)); - + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferFftShape( + 
operand_shape, fft_type, fft_length)); + *instr.mutable_shape() = shape.ToProto(); instr.set_fft_type(fft_type); for (int64 i : fft_length) { instr.add_fft_length(i); @@ -1114,7 +1142,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) { } const Shape infeed_instruction_shape = ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}); - *instr.mutable_shape() = infeed_instruction_shape; + *instr.mutable_shape() = infeed_instruction_shape.ToProto(); instr.set_infeed_config(config); if (ShapeUtil::IsArray(shape) && sharding() && @@ -1135,7 +1163,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) { XlaOp token; auto make_token = [&]() { HloInstructionProto token_instr; - *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); return AddInstruction(std::move(token_instr), HloOpcode::kAfterAll, {}); }; if (sharding()) { @@ -1174,7 +1202,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) { // TODO(b/80000000): Remove this when clients have been updated to handle // tokens. HloInstructionProto infeed_data; - *infeed_data.mutable_shape() = shape; + *infeed_data.mutable_shape() = shape.ToProto(); infeed_data.set_tuple_index(0); return AddInstruction(std::move(infeed_data), HloOpcode::kGetTupleElement, {infeed}); @@ -1190,7 +1218,7 @@ XlaOp XlaBuilder::InfeedWithToken(const XlaOp& token, const Shape& shape, } const Shape infeed_instruction_shape = ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}); - *instr.mutable_shape() = infeed_instruction_shape; + *instr.mutable_shape() = infeed_instruction_shape.ToProto(); instr.set_infeed_config(config); if (ShapeUtil::IsArray(shape) && sharding() && @@ -1215,7 +1243,7 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout, ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - *instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); // Check and set outfeed shape. if (!LayoutUtil::HasLayout(shape_with_layout)) { @@ -1228,14 +1256,14 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout, ShapeUtil::HumanStringWithLayout(shape_with_layout), ShapeUtil::HumanStringWithLayout(operand_shape)); } - *instr.mutable_outfeed_shape() = shape_with_layout; + *instr.mutable_outfeed_shape() = shape_with_layout.ToProto(); instr.set_outfeed_config(outfeed_config); // Outfeed takes a token as its second operand. Generate the token to pass // to the outfeed. HloInstructionProto token_instr; - *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr), HloOpcode::kAfterAll, {})); @@ -1249,7 +1277,7 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout, // TODO(b/80000000): Remove this when clients have been updated to handle // tokens. HloInstructionProto tuple_instr; - *tuple_instr.mutable_shape() = ShapeUtil::MakeNil(); + *tuple_instr.mutable_shape() = ShapeUtil::MakeNil().ToProto(); // The dummy tuple should have no sharding. 
{ @@ -1268,7 +1296,7 @@ XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token, return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - *instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); // Check and set outfeed shape. if (!LayoutUtil::HasLayout(shape_with_layout)) { @@ -1281,7 +1309,7 @@ XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token, ShapeUtil::HumanStringWithLayout(shape_with_layout), ShapeUtil::HumanStringWithLayout(operand_shape)); } - *instr.mutable_outfeed_shape() = shape_with_layout; + *instr.mutable_outfeed_shape() = shape_with_layout.ToProto(); instr.set_outfeed_config(outfeed_config); @@ -1293,7 +1321,7 @@ XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token, XlaOp XlaBuilder::CreateToken() { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - *instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); return AddInstruction(std::move(instr), HloOpcode::kAfterAll); }); } @@ -1303,8 +1331,17 @@ XlaOp XlaBuilder::AfterAll(absl::Span tokens) { if (tokens.empty()) { return InvalidArgument("AfterAll requires at least one operand"); } + for (int i = 0; i < tokens.size(); ++i) { + const XlaOp& operand = tokens[i]; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + if (!ShapeUtil::IsToken(operand_shape)) { + return InvalidArgument( + "All operands to AfterAll must be tokens; operand %d has shape %s", + i, ShapeUtil::HumanString(operand_shape)); + } + } HloInstructionProto instr; - *instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); return AddInstruction(std::move(instr), HloOpcode::kAfterAll, tokens); }); } @@ -1321,7 +1358,7 @@ XlaOp XlaBuilder::CustomCall( "are reserved for internal use.", call_target_name); } - *instr.mutable_shape() = shape; + *instr.mutable_shape() = shape.ToProto(); instr.set_custom_call_target(call_target_name); instr.set_custom_call_opaque(opaque); if (operand_shapes_with_layout.has_value()) { @@ -1345,7 +1382,7 @@ XlaOp XlaBuilder::CustomCall( "constrained layout.", operand_num); } - *instr.add_operand_shapes_with_layout() = operand_shape; + *instr.add_operand_shapes_with_layout() = operand_shape.ToProto(); ++operand_num; } } @@ -1499,9 +1536,9 @@ XlaOp XlaBuilder::Transpose(const XlaOp& operand, return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferTransposeShape(operand_shape, permutation)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTransposeShape( + operand_shape, permutation)); + *instr.mutable_shape() = shape.ToProto(); for (int64 dim : permutation) { instr.add_dimensions(dim); } @@ -1514,9 +1551,9 @@ XlaOp XlaBuilder::Rev(const XlaOp& operand, return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferReverseShape(operand_shape, dimensions)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferReverseShape( + operand_shape, dimensions)); + *instr.mutable_shape() = shape.ToProto(); for (int64 dim : dimensions) { instr.add_dimensions(dim); } @@ -1535,9 +1572,9 @@ XlaOp XlaBuilder::Sort(const XlaOp& 
keys, absl::Span values, GetOperandShapes(values)); absl::c_transform(values_shapes, std::back_inserter(operand_shape_ptrs), [](const Shape& shape) { return &shape; }); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferVariadicOpShape( - HloOpcode::kSort, operand_shape_ptrs)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferVariadicOpShape( + HloOpcode::kSort, operand_shape_ptrs)); + *instr.mutable_shape() = shape.ToProto(); if (dimension == -1) { TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys)); dimension = ShapeUtil::Rank(keys_shape) - 1; @@ -1559,9 +1596,9 @@ XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand, return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferConvertShape(operand_shape, new_element_type)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvertShape( + operand_shape, new_element_type)); + *instr.mutable_shape() = shape.ToProto(); return AddInstruction(std::move(instr), HloOpcode::kConvert, {operand}); }); } @@ -1571,9 +1608,9 @@ XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand, return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferConvertShape(operand_shape, new_element_type)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvertShape( + operand_shape, new_element_type)); + *instr.mutable_shape() = shape.ToProto(); return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert, {operand}); }); @@ -1605,11 +1642,11 @@ XlaOp XlaBuilder::Map(absl::Span operands, TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape, computation.GetProgramShape()); TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferMapShape(operand_shape_ptrs, called_program_shape, - dimensions)); + Shape shape, ShapeInference::InferMapShape( + operand_shape_ptrs, called_program_shape, dimensions)); + *instr.mutable_shape() = shape.ToProto(); - const Shape& output_shape = instr.shape(); + Shape output_shape(instr.shape()); const int64 output_rank = ShapeUtil::Rank(output_shape); AddCalledComputation(computation, &instr); std::vector new_operands(operands.begin(), operands.end()); @@ -1652,7 +1689,7 @@ XlaOp XlaBuilder::RngOp(RandomDistribution distribution, } TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); - *instr.mutable_shape() = shape; + *instr.mutable_shape() = shape.ToProto(); instr.set_distribution(distribution); @@ -1680,10 +1717,10 @@ XlaOp XlaBuilder::While(const XlaComputation& condition, TF_ASSIGN_OR_RETURN(const auto& condition_program_shape, condition.GetProgramShape()); TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferWhileShape(condition_program_shape, - body_program_shape, init_shape)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferWhileShape( + condition_program_shape, + body_program_shape, init_shape)); + *instr.mutable_shape() = shape.ToProto(); // Body comes before condition computation in the vector. 
AddCalledComputation(body, &instr); AddCalledComputation(condition, &instr); @@ -1700,10 +1737,10 @@ XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& start_indices, TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape, GetShape(start_indices)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferGatherShape(input_shape, start_indices_shape, + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferGatherShape( + input_shape, start_indices_shape, dimension_numbers, slice_sizes)); + *instr.mutable_shape() = shape.ToProto(); *instr.mutable_gather_dimension_numbers() = dimension_numbers; for (int64 bound : slice_sizes) { @@ -1728,10 +1765,11 @@ XlaOp XlaBuilder::Scatter(const XlaOp& input, const XlaOp& scatter_indices, TF_ASSIGN_OR_RETURN(const Shape& updates_shape, GetShape(updates)); TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape, update_computation.GetProgramShape()); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferScatterShape( input_shape, scatter_indices_shape, updates_shape, to_apply_shape, dimension_numbers)); + *instr.mutable_shape() = shape.ToProto(); *instr.mutable_scatter_dimension_numbers() = dimension_numbers; @@ -1758,10 +1796,11 @@ XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand, TF_ASSIGN_OR_RETURN(const ProgramShape& false_computation_shape, false_computation.GetProgramShape()); TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), + Shape shape, ShapeInference::InferConditionalShape( predicate_shape, true_operand_shape, false_operand_shape, true_computation_shape, false_computation_shape)); + *instr.mutable_shape() = shape.ToProto(); // The index of true_computation must be 0 and that of false computation // must be 1. 
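The ordering noted above (true computation at index 0, false computation at index 1) is set up by the builder; callers only see the free-function API. A minimal sketch of such a caller, assuming the existing `Conditional`, `Parameter`, `Add`, `Sub`, and `ConstantR0` wrappers in xla_builder.h; the helper computations and names are illustrative, not taken from the patch:

```
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"

namespace xla {

// Builds p ? x + 1 : x - 1. The builder records the true computation as
// called-computation index 0 and the false computation as index 1.
XlaOp ConditionalExample(XlaBuilder* b, XlaOp p, XlaOp x) {
  XlaComputation add_one = [] {
    XlaBuilder tb("true_branch");
    auto v = Parameter(&tb, 0, ShapeUtil::MakeShape(F32, {}), "v");
    Add(v, ConstantR0<float>(&tb, 1.0f));
    return tb.Build().ConsumeValueOrDie();
  }();
  XlaComputation sub_one = [] {
    XlaBuilder fb("false_branch");
    auto v = Parameter(&fb, 0, ShapeUtil::MakeShape(F32, {}), "v");
    Sub(v, ConstantR0<float>(&fb, 1.0f));
    return fb.Build().ConsumeValueOrDie();
  }();
  return Conditional(p, /*true_operand=*/x, add_one,
                     /*false_operand=*/x, sub_one);
}

}  // namespace xla
```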
@@ -1803,9 +1842,10 @@ XlaOp XlaBuilder::Reduce(absl::Span operands, [](const Shape& shape) { return &shape; }); TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), + Shape shape, ShapeInference::InferReduceShape( operand_shape_ptrs, dimensions_to_reduce, called_program_shape)); + *instr.mutable_shape() = shape.ToProto(); for (int64 dim : dimensions_to_reduce) { instr.add_dimensions(dim); @@ -1868,10 +1908,10 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding( MakeWindow(window_dimensions, window_strides, padding, /*lhs_dilation=*/base_dilations, /*rhs_dilation=*/window_dilations)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferReduceWindowShape(operand_shape, init_shape, - instr.window(), to_apply_shape)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferReduceWindowShape( + operand_shape, init_shape, + instr.window(), to_apply_shape)); + *instr.mutable_shape() = shape.ToProto(); AddCalledComputation(computation, &instr); return AddInstruction(std::move(instr), HloOpcode::kReduceWindow, @@ -1889,9 +1929,10 @@ XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale, TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale)); TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset)); TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), + Shape shape, ShapeInference::InferBatchNormTrainingShape( operand_shape, scale_shape, offset_shape, feature_index)); + *instr.mutable_shape() = shape.ToProto(); instr.set_epsilon(epsilon); instr.set_feature_index(feature_index); @@ -1913,10 +1954,11 @@ XlaOp XlaBuilder::BatchNormInference(const XlaOp& operand, const XlaOp& scale, TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset)); TF_ASSIGN_OR_RETURN(const Shape& mean_shape, GetShape(mean)); TF_ASSIGN_OR_RETURN(const Shape& variance_shape, GetShape(variance)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferBatchNormInferenceShape( - operand_shape, scale_shape, offset_shape, - mean_shape, variance_shape, feature_index)); + TF_ASSIGN_OR_RETURN( + Shape shape, ShapeInference::InferBatchNormInferenceShape( + operand_shape, scale_shape, offset_shape, mean_shape, + variance_shape, feature_index)); + *instr.mutable_shape() = shape.ToProto(); instr.set_epsilon(epsilon); instr.set_feature_index(feature_index); @@ -1938,10 +1980,11 @@ XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale, TF_ASSIGN_OR_RETURN(const Shape& batch_mean_shape, GetShape(batch_mean)); TF_ASSIGN_OR_RETURN(const Shape& batch_var_shape, GetShape(batch_var)); TF_ASSIGN_OR_RETURN(const Shape& grad_output_shape, GetShape(grad_output)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferBatchNormGradShape( operand_shape, scale_shape, batch_mean_shape, batch_var_shape, grad_output_shape, feature_index)); + *instr.mutable_shape() = shape.ToProto(); instr.set_epsilon(epsilon); instr.set_feature_index(feature_index); @@ -1972,9 +2015,9 @@ XlaOp XlaBuilder::CrossReplicaSum( return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferCrossReplicaSumShape({&operand_shape})); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape( + {&operand_shape})); + *instr.mutable_shape() = shape.ToProto(); for (const ReplicaGroup& group : replica_groups) { *instr.add_replica_groups() = group; @@ -2027,8 +2070,8 @@ XlaOp 
XlaBuilder::AllToAll(const XlaOp& operand, int64 split_dimension, absl::c_transform(slice_shapes, std::back_inserter(slice_shape_ptrs), [](const Shape& shape) { return &shape; }); TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs)); + Shape shape, ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs)); + *instr.mutable_shape() = shape.ToProto(); for (const ReplicaGroup& group : replica_groups) { *instr.add_replica_groups() = group; } @@ -2053,8 +2096,9 @@ XlaOp XlaBuilder::CollectivePermute( TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); HloInstructionProto instr; TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), + Shape shape, ShapeInference::InferCollectivePermuteShape(operand_shape)); + *instr.mutable_shape() = shape.ToProto(); for (const auto& pair : source_target_pairs) { auto* proto_pair = instr.add_source_target_pairs(); @@ -2103,10 +2147,11 @@ XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding( TF_ASSIGN_OR_RETURN(*instr.mutable_window(), MakeWindow(window_dimensions, window_strides, padding, /*lhs_dilation=*/{}, /*rhs_dilation=*/{})); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferSelectAndScatterShape( operand_shape, select_shape, instr.window(), source_shape, init_shape, scatter_shape)); + *instr.mutable_shape() = shape.ToProto(); AddCalledComputation(select, &instr); AddCalledComputation(scatter, &instr); @@ -2121,9 +2166,10 @@ XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits, return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferReducePrecisionShape( operand_shape, exponent_bits, mantissa_bits)); + *instr.mutable_shape() = shape.ToProto(); instr.set_exponent_bits(exponent_bits); instr.set_mantissa_bits(mantissa_bits); return AddInstruction(std::move(instr), HloOpcode::kReducePrecision, @@ -2138,7 +2184,7 @@ void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) { // TODO(b/80000000): Remove this when clients have been updated to handle // tokens. HloInstructionProto token_instr; - *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr), HloOpcode::kAfterAll, {})); @@ -2157,15 +2203,17 @@ XlaOp XlaBuilder::SendWithToken(const XlaOp& operand, const XlaOp& token, // token}. 
HloInstructionProto send_instr; TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand)); - *send_instr.mutable_shape() = ShapeUtil::MakeTupleShape( - {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()}); + *send_instr.mutable_shape() = + ShapeUtil::MakeTupleShape( + {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()}) + .ToProto(); send_instr.set_channel_id(handle.handle()); TF_ASSIGN_OR_RETURN(XlaOp send, AddInstruction(std::move(send_instr), HloOpcode::kSend, {operand, token})); HloInstructionProto send_done_instr; - *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); send_done_instr.set_channel_id(handle.handle()); return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone, {send}); @@ -2179,7 +2227,7 @@ XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) { // TODO(b/80000000): Remove this when clients have been updated to handle // tokens. HloInstructionProto token_instr; - *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr), HloOpcode::kAfterAll, {})); @@ -2190,7 +2238,7 @@ XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) { // TODO(b/80000000): Remove this when clients have been updated to handle // tokens. HloInstructionProto recv_data; - *recv_data.mutable_shape() = shape; + *recv_data.mutable_shape() = shape.ToProto(); recv_data.set_tuple_index(0); return AddInstruction(std::move(recv_data), HloOpcode::kGetTupleElement, {recv}); @@ -2207,15 +2255,18 @@ XlaOp XlaBuilder::RecvWithToken(const XlaOp& token, const Shape& shape, // Recv instruction produces a tuple of {receive buffer, U32 context, // token}. HloInstructionProto recv_instr; - *recv_instr.mutable_shape() = ShapeUtil::MakeTupleShape( - {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()}); + *recv_instr.mutable_shape() = + ShapeUtil::MakeTupleShape( + {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()}) + .ToProto(); recv_instr.set_channel_id(handle.handle()); TF_ASSIGN_OR_RETURN(XlaOp recv, AddInstruction(std::move(recv_instr), HloOpcode::kRecv, {token})); HloInstructionProto recv_done_instr; *recv_done_instr.mutable_shape() = - ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}); + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}) + .ToProto(); recv_done_instr.set_channel_id(handle.handle()); return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone, {recv}); @@ -2249,9 +2300,11 @@ XlaOp XlaBuilder::SendToHost(const XlaOp& operand, const XlaOp& token, // Send instruction produces a tuple of {aliased operand, U32 context, // token}. 
HloInstructionProto send_instr; - *send_instr.mutable_shape() = ShapeUtil::MakeTupleShape( - {shape_with_layout, ShapeUtil::MakeShape(U32, {}), - ShapeUtil::MakeTokenShape()}); + *send_instr.mutable_shape() = + ShapeUtil::MakeTupleShape({shape_with_layout, + ShapeUtil::MakeShape(U32, {}), + ShapeUtil::MakeTokenShape()}) + .ToProto(); send_instr.set_channel_id(handle.handle()); send_instr.set_is_host_transfer(true); TF_ASSIGN_OR_RETURN(XlaOp send, @@ -2259,7 +2312,7 @@ XlaOp XlaBuilder::SendToHost(const XlaOp& operand, const XlaOp& token, {operand, token})); HloInstructionProto send_done_instr; - *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto(); send_done_instr.set_channel_id(handle.handle()); send_done_instr.set_is_host_transfer(true); return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone, @@ -2288,8 +2341,10 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape, // Recv instruction produces a tuple of {receive buffer, U32 context, // token}. HloInstructionProto recv_instr; - *recv_instr.mutable_shape() = ShapeUtil::MakeTupleShape( - {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()}); + *recv_instr.mutable_shape() = + ShapeUtil::MakeTupleShape( + {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()}) + .ToProto(); recv_instr.set_channel_id(handle.handle()); recv_instr.set_is_host_transfer(true); TF_ASSIGN_OR_RETURN(XlaOp recv, AddInstruction(std::move(recv_instr), @@ -2297,7 +2352,8 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape, HloInstructionProto recv_done_instr; *recv_done_instr.mutable_shape() = - ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}); + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}) + .ToProto(); recv_done_instr.set_channel_id(handle.handle()); recv_done_instr.set_is_host_transfer(true); return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone, @@ -2309,9 +2365,9 @@ XlaOp XlaBuilder::GetDimensionSize(const XlaOp& operand, int64 dimension) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const auto& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferGetDimensionSizeShape(operand_shape, dimension)); + TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferGetDimensionSizeShape( + operand_shape, dimension)); + *instr.mutable_shape() = shape.ToProto(); instr.add_dimensions(dimension); return AddInstruction(std::move(instr), HloOpcode::kGetDimensionSize, {operand}); @@ -2356,7 +2412,7 @@ StatusOr XlaBuilder::BuildConstantSubGraph( SetProtoIdAndName(&entry, StrCat(name_, "_compute_constant"), kNameSeparator, GetNextId()); entry.set_root_id(root->id()); - ProgramShape* program_shape = entry.mutable_program_shape(); + ProgramShapeProto* program_shape = entry.mutable_program_shape(); *program_shape->mutable_result() = root->shape(); // We use std::set to keep the instruction ids in ascending order (which is @@ -2617,9 +2673,10 @@ XlaOp Broadcast(const XlaOp& operand, absl::Span broadcast_sizes) { return operand.builder()->Broadcast(operand, broadcast_sizes); } -XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape, +XlaOp BroadcastInDim(const XlaOp& operand, + const absl::Span out_dim_size, const absl::Span broadcast_dimensions) { - return operand.builder()->BroadcastInDim(operand, shape, + return operand.builder()->BroadcastInDim(operand, 
out_dim_size, broadcast_dimensions); } diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 68314a026ea..098efb60f9b 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -263,35 +264,30 @@ class XlaBuilder { // evaluating the computation. StatusOr IsConstant(const XlaOp& operand) const; + // Sets up binding which indicates that the `target_dim_num` in the subshape + // `target_param_index` of parameter `target_param_num` is a dynamic dimension + // and its real dynamic size is represented by `dynamic_param_index` in + // parameter `dynamic_param_num`. + // + // TODO(b/119520625): Remove this API once we have more dynamic shape infra + // ready. + Status SetDynamicBinding(int64 dynamic_size_param_num, + ShapeIndex dynamic_size_param_index, + int64 target_param_num, + ShapeIndex target_param_index, int64 target_dim_num); + private: // Build helper which takes the id of the root operation.. StatusOr Build(int64 root_id); - // Enqueues a "retrieve parameter value" instruction for a parameter that was - // passed to the computation. + // Description for the methods below can be found in the corresponding public + // functions section in this file. + XlaOp Parameter(int64 parameter_number, const Shape& shape, const string& name); - // Enqueues a constant with the value of the given literal onto the - // computation. XlaOp ConstantLiteral(const LiteralSlice& literal); - // Enqueues a constant onto the computation. Methods are templated on the - // native host type (NativeT) which corresponds to a specific XLA - // PrimitiveType as given in the following table: - // - // Native Type PrimitiveType - // ----------------------------- - // bool PRED - // int32 S32 - // int64 S64 - // uint32 U32 - // uint64 U64 - // float F32 - // double F64 - // - // Note: not all primitive types defined in xla_data.proto have a - // corresponding native type yet. template XlaOp ConstantR0(NativeT value); template @@ -321,181 +317,79 @@ class XlaBuilder { template XlaOp ConstantR4FromArray4D(const Array4D& values); - // Enqueues a rank one constant (vector) onto the computation. The vector has - // size 'length' and every element has the value 'value'. template XlaOp ConstantR1(int64 length, NativeT value); - // Adds dimensions to an array by duplicating the data in the array. - // - // The new dimensions are inserted on the left, i.e. if - // broadcast_sizes has values {a0, ..., aN} and the operand shape - // has dimensions {b0, ..., bM} then the shape of the output has - // dimensions {a0, ..., aN, b0, ..., bM}. - // - // The new dimensions index into copies of the operand, i.e. 
- // - // output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM] XlaOp Broadcast(const XlaOp& operand, absl::Span broadcast_sizes); - XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape, + XlaOp BroadcastInDim(const XlaOp& operand, + const absl::Span out_dim_size, const absl::Span broadcast_dimensions); - // Enqueues a pad operation onto the computation that pads the given value on - // the edges as well as between the elements of the input. padding_config - // specifies the padding amount for each dimension. XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value, const PaddingConfig& padding_config); - // Enqueues an operation onto the computation that flattens the operand based - // on the dimension order (major/slowest-varying to minor/fastest-varying) - // given, followed by reshaping it into the shape with the given dimension - // sizes (also major to minor). Conceptually, this is a limited form of - // "shape casting". XlaOp Reshape(const XlaOp& operand, absl::Span dimensions, absl::Span new_sizes); - // Enqueues an operation onto the computation that collapses the operand, from - // first to last dimension (C order), then reshapes it to the given dimension - // sizes. Conceptually, this is a limited form of "shape casting". XlaOp Reshape(const XlaOp& operand, absl::Span new_sizes); - // Wrapper for Reshape. - // Enqueues an operation to collapse the provided dimensions; e.g. an - // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to - // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must - // be a consecutive, in-order subsequence of the operand dimensions. - // - // Note that collapsing a single dimension does nothing: - // - // {256} collapsing {0} => {256} - // {1} collapsing {0} => {1} - // - // Collapsing multiple dimensions produces a single result dimension: - // - // {256, 2} collapsing {0,1} => {512} - // {256, 2, 3} collapsing {0,1} => {512, 3} - // - // This could potentially cause data to be moved -- it provides a more - // structured form of reshaping than an arbitrary Reshape operation. XlaOp Collapse(const XlaOp& operand, absl::Span dimensions); - // Enqueues a slice operation onto the computation that slices the operand - // from the start indices to the limit indices; e.g. - // - // x - // [ 0 1 2 3 ] - // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ] - // [ 8 9 a b ] - // - // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D - // range notation. - // The strides parameter determines the stride over the slice XlaOp Slice(const XlaOp& operand, absl::Span start_indices, absl::Span limit_indices, absl::Span strides); - // Enqueues a slice operation in a given dimension, taking all other - // dimensions as they are; e.g. if dimno is 1 from start_index 2 to - // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand - // for: - // - // array[:, 2:4:1, :] XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno); - // Enqueues a slice operation onto the computation that slices the 'operand' - // from dynamic start indices which are passed in 'start_indices'. - // The size of the slice in each dimension is passed in 'slice_sizes', - // which specify the end point of exclusive slice intervals in each - // dimension [start, start + size). - // The shape of 'start_indices' must be rank == 1, with dimension size - // equal to the rank of the 'operand'. 
- // Slice index calculations are computed modulo input dimension sizes to - // prevent dynamic start indices from generating out-of-bound array accesses. XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices, absl::Span slice_sizes); - // Enqueues a dynamic update slice operation onto the computation, which - // updates a slice of 'operand' with 'update' at dynamic 'start_indices'. - // The shape of 'update' determines the shape of the slice of 'operand' - // which is updated. - // The indices specified in 'start_indices' specify the offset of the slice - // of 'operand' which is updated. - // - // update = {10, 11} // calculated at runtime. - // [1 2 3] start = {1, 1} // calculated at runtime. [1 2 3 ] - // [4 5 6] => DynamicUpdateslice(data, update, start) => [4 10 11] - // [7 8 9] [7 8 9 ] - // - // The shape of 'start_indices' must be rank == 1, with dimension size - // equal to the rank of the 'operand'. - // Slice index calculations are computed modulo update dimension sizes to - // prevent dynamic start indices from generating out-of-bound array accesses. XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update, const XlaOp& start_indices); - // Enqueues a concatenate instruction onto the computation. 'operands' must - // have >= 1 entry. XlaOp ConcatInDim(absl::Span operands, int64 dimension); - // Enqueue a tracing operation onto the computation; the computation will emit - // a logging message with the operand. void Trace(const string& tag, const XlaOp& operand); - // Enqueues a conditional-move-like select operation onto the computation; - // predicated on pred, selects between on_true and on_false. XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false); - // Enqueues a tuple-creation instruction onto the computation. XlaOp Tuple(absl::Span elements); - // Enqueues a tuple-element-get instruction onto the computation. XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index); - // Enqueues an equal-to comparison instruction onto the computation. XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a not-equal comparison instruction onto the computation. XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a greater-or-equal comparison instruction onto the computation. XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a greater-than comparison instruction onto the computation. XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a less-than comparison instruction onto the computation. XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a less-or-equal comparison instruction onto the computation. XlaOp Le(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a dot instruction onto the computation. XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs, const PrecisionConfig* precision_config = nullptr); - // Enqueues a general dot instruction onto the computation. XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs, const DotDimensionNumbers& dimension_numbers, const PrecisionConfig* precision_config = nullptr); - // Enqueues a convolution instruction onto the computation, which uses the - // default convolution dimension numbers. 
XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, Padding padding, int64 feature_group_count = 1, const PrecisionConfig* precision_config = nullptr); - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration in the format returned by MakePadding(). XlaOp ConvWithGeneralPadding( const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, @@ -503,8 +397,6 @@ class XlaBuilder { int64 feature_group_count = 1, const PrecisionConfig* precision_config = nullptr); - // Enqueues a convolution instruction onto the computation, with the caller - // provided dimension numbers configuration. XlaOp ConvWithGeneralDimensions( const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, Padding padding, @@ -512,8 +404,6 @@ class XlaBuilder { int64 feature_group_count = 1, const PrecisionConfig* precision_config = nullptr); - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration as well as the dimension numbers. XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, absl::Span> padding, @@ -521,8 +411,6 @@ class XlaBuilder { int64 feature_group_count = 1, const PrecisionConfig* precision_config = nullptr); - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration, dilation factors and dimension numbers. XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, absl::Span> padding, @@ -532,80 +420,53 @@ class XlaBuilder { int64 feature_group_count = 1, const PrecisionConfig* precision_config = nullptr); - // Enqueues an FFT instruction onto the computation, of the given type and - // with the given FFT length. XlaOp Fft(const XlaOp& operand, FftType fft_type, absl::Span fft_length); - // Enqueues an infeed instruction onto the computation, which writes data of - // the given shape to the infeed buffer of the device. XlaOp Infeed(const Shape& shape, const string& config = ""); XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape, const string& config = ""); - // Enqueues an outfeed instruction onto the computation. This instruction - // generates outgoing data transfers for the given data. - // - // shape_with_layout communicates the laid out shape that we want to outfeed - // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error - // will occur. void Outfeed(const XlaOp& operand, const Shape& shape_with_layout, const string& outfeed_config); XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token, const Shape& shape_with_layout, const string& outfeed_config); - // Enqueues a call instruction onto the computation. XlaOp Call(const XlaComputation& computation, absl::Span operands); - // Enqueues a custom call instruction onto the computation. XlaOp CustomCall( const string& call_target_name, absl::Span operands, const Shape& shape_with_layout, const string& opaque, absl::optional> operand_shapes_with_layout); - // The following methods enqueue element-wise binary arithmetic operations - // onto the computation. The shapes of the operands have to match unless one - // of the operands is a scalar, or an explicit broadcast dimension is given - // (see g3doc for more details). - - // Enqueues a complex compose instruction onto the computation. XlaOp Complex(const XlaOp& real, const XlaOp& imag, absl::Span broadcast_dimensions = {}); - // Enqueues a complex conjugate instruction onto the computation. 
XlaOp Conj(const XlaOp& operand); - // Enqueues an add instruction onto the computation. XlaOp Add(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a subtract instruction onto the computation. XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a multiply instruction onto the computation. XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a divide instruction onto the computation. XlaOp Div(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a remainder instruction onto the computation. XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a max instruction onto the computation. XlaOp Max(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a min instruction onto the computation. XlaOp Min(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Element-wise logical operators XlaOp And(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); @@ -624,32 +485,23 @@ class XlaBuilder { XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Reduces an array among the provided dimensions, given "computation" as a - // reduction operator. XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value, const XlaComputation& computation, absl::Span dimensions_to_reduce); - // Reduces several arrays simultaneously among the provided dimensions, given - // "computation" as a reduction operator. XlaOp Reduce(absl::Span operands, absl::Span init_values, const XlaComputation& computation, absl::Span dimensions_to_reduce); - // Convenience wrapper around the above that reduces all the dimensions in the - // operand shape. XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value, const XlaComputation& computation); - // Enqueues a windowed reduce instruction onto the computation. XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value, const XlaComputation& computation, absl::Span window_dimensions, absl::Span window_strides, Padding padding); - // As ReduceWindow(), but the padding is given in the format - // returned by MakePadding(). XlaOp ReduceWindowWithGeneralPadding( const XlaOp& operand, const XlaOp& init_value, const XlaComputation& computation, @@ -659,48 +511,22 @@ class XlaBuilder { absl::Span window_dilations, absl::Span> padding); - // Returns the sum of the operand value within each subgroup of replicas. All - // replicas supply one input to the sum and all replicas receive the resulting - // sum for each subgroup. XlaOp CrossReplicaSum(const XlaOp& operand, absl::Span replica_groups = {}); - // Enqueues an operation that do an AllReduce of the operand cross cores. Here - // AllReduce means doing a reduction on the input operand cross cores and then - // broadcasting the reduction result to those cores. The reduction function is - // defined by `computation`, which should be a commutative computation on - // scalars, e.g., add, min, or max. The way that AllReduce is applied is - // configured by: - // - // - `replica_groups`: each ReplicaGroup contains a list of replica id. If - // empty, all replicas belong to one group. Allreduce will be applied within - // subgroups. For example, we have 4 replicas, then - // replica_groups={{0,2},{1,3}} means, replica 0 and 2 are in subgroup 0, - // replica 1 and 3 are in subgroup 1. 
- // - // - `channel_id`: for Allreduce nodes from different modules, if they have - // the same channel_id, they will be 'Allreduce'd. If empty, Allreduce will - // not be applied cross modules. - // - // TODO(b/117564385): Rename this to AllReduce when it's ready to use. XlaOp CrossReplicaSum( const XlaOp& operand, const XlaComputation& computation, absl::Span replica_groups = {}, const absl::optional& channel_id = absl::nullopt); - // Enqueues an operation that do an Alltoall of the operand cross cores. XlaOp AllToAll(const XlaOp& operand, int64 split_dimension, int64 concat_dimension, int64 split_count, const std::vector& replica_groups); - // Enqueues an operation that do an CollectivePermute of the operand cross - // cores. XlaOp CollectivePermute( const XlaOp& operand, const std::vector>& source_target_pairs); - // Enqueues an operation that scatters the `source` array to the selected - // indices of each window. XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select, absl::Span window_dimensions, absl::Span window_strides, @@ -708,8 +534,6 @@ class XlaBuilder { const XlaOp& init_value, const XlaComputation& scatter); - // As SelectAndScatter(), but the padding is given in the format - // returned by MakePadding(). XlaOp SelectAndScatterWithGeneralPadding( const XlaOp& operand, const XlaComputation& select, absl::Span window_dimensions, @@ -717,217 +541,119 @@ class XlaBuilder { absl::Span> padding, const XlaOp& source, const XlaOp& init_value, const XlaComputation& scatter); - // Enqueues an abs instruction onto the computation. XlaOp Abs(const XlaOp& operand); - // Enqueues a atan2 instruction onto the computation. XlaOp Atan2(const XlaOp& y, const XlaOp& x, absl::Span broadcast_dimensions = {}); - // Enqueues an exp instruction onto the computation. XlaOp Exp(const XlaOp& operand); - // Enqueues an expm1 instruction onto the computation. XlaOp Expm1(const XlaOp& operand); - // Enqueues a floor instruction onto the computation. XlaOp Floor(const XlaOp& operand); - // Enqueues a ceil instruction onto the computation. XlaOp Ceil(const XlaOp& operand); - // Enqueues a round instruction onto the computation, rounding to nearest even - // with half-way cases rounding away from zero. XlaOp Round(const XlaOp& operand); - // Enqueues an log instruction (natural logarithm) onto the computation. XlaOp Log(const XlaOp& operand); - // Enqueues an log1p instruction (log(x+1)) onto the computation. XlaOp Log1p(const XlaOp& operand); - // Enqueues a sign instruction onto the computation. XlaOp Sign(const XlaOp& operand); - // Enqueues a count leading zeros instruction onto the computation. XlaOp Clz(const XlaOp& operand); - // Enqueues a cosine instruction onto the computation. XlaOp Cos(const XlaOp& operand); - // Enqueues a sine instruction onto the computation. XlaOp Sin(const XlaOp& operand); - // Enqueues a tanh instruction onto the computation. XlaOp Tanh(const XlaOp& operand); - // Enqueues a real-part instruction onto the computation. XlaOp Real(const XlaOp& operand); - // Enqueues an imaginary-part instruction onto the computation. XlaOp Imag(const XlaOp& operand); - // Enqueues a lhs^rhs computation onto the computation. XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues an operator that tests if the operand's values are finite, i.e., - // not Inf or NaN. Defined only for floating-point types. Returns an array of - // booleans with the same shape where entries are true iff the corresponding - // entry was NaN. 
XlaOp IsFinite(const XlaOp& operand); - // Enqueues an iota operation onto the computation. XlaOp Iota(const Shape& shape, int64 iota_dimension); - // Enqueues a rank-1 iota operation onto the computation. XlaOp Iota(PrimitiveType type, int64 size); - // Enqueues a convert instruction onto the computation that changes the - // element type of the operand array to primitive_type. XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type); - // Enqueues a no-op instruction onto the computation that changes - // the element type of the operand array to primitive_type. The - // bit-widths of the source and destination element types must be - // identical. XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type); - // Enqueues a negate instruction onto the computation. XlaOp Neg(const XlaOp& operand); - // Enqueues a transpose instruction onto the computation. XlaOp Transpose(const XlaOp& operand, absl::Span permutation); - // Enqueues a reverse instruction onto the computation. The order of the - // elements in the given dimensions is reversed (i.e., the element at index i - // is moved to index dimension_size - 1 - i). XlaOp Rev(const XlaOp& operand, absl::Span dimensions); - // Enqueues a sort (as increasing order) instruction onto the computation. - // If only keys are provided: - // * If the keys are an rank-1 tensor (an array), the result is a sorted array - // of keys, in ascending order. - // * If the keys have higher rank, the keys are sorted along the provided - // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension - // value of 0 will indepenently sort every column, and a dimension value of 1 - // will independently sort each row. If no dimension number is provided, then - // the last dimension is chosen by default. - // - // If both keys and values are provided: - // * The keys and all values must be tensors with the same dimensions. The - // element types of the tensors may be different. - // * The result is a tuple that consists of a sorted tensor of keys (along the - // provided dimension, as above) as the first element, and tensors with their - // corresponding values as the other elements. XlaOp Sort(const XlaOp& keys, absl::Span values = {}, int64 dimension = -1); - // Enqueues a clamp instruction onto the computation. XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max); - // Enqueues a map instruction onto the computation. XlaOp Map(absl::Span operands, const XlaComputation& computation, absl::Span dimensions, absl::Span static_operands = {}); - // Enqueues a N(mu, sigma) random number generation instruction onto the - // computation. XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape); - // Enqueues a U(a, b) random number generation instruction onto the - // computation. Returns values in the semi-open interval [a, b). XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape); - // Enqueues a while node onto the computation. XlaOp While(const XlaComputation& condition, const XlaComputation& body, const XlaOp& init); - // Enqueues a conditional node onto the computation. XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand, const XlaComputation& true_computation, const XlaOp& false_operand, const XlaComputation& false_computation); - // Enqueues a ReducePrecision node onto the computation. XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits, const int mantissa_bits); - // Enqueues a Gather node onto the computation. 
XlaOp Gather(const XlaOp& input, const XlaOp& start_indices, const GatherDimensionNumbers& dimension_numbers, absl::Span slice_sizes); - // Enqueues a Scatter node onto the computation. XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers); - // Enqueues a Send node onto the computation for device-to-device - // communication, to send the given operand to a Recv instruction that shares - // the same channel handle. void Send(const XlaOp& operand, const ChannelHandle& handle); XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, const ChannelHandle& handle); - // Enqueues a Send node which sends data to the host. XlaOp SendToHost(const XlaOp& operand, const XlaOp& token, const Shape& shape_with_layout, const ChannelHandle& handle); - // Enqueues a Recv node which receives data from the host. XlaOp RecvFromHost(const XlaOp& token, const Shape& shape, const ChannelHandle& handle); - // Enqueues an AfterAll operation with no operands producing a token-shaped - // value. XlaOp CreateToken(); - // Enqueues an AfterAll operation with no operands producing a token-shaped - // value. XlaOp AfterAll(absl::Span tokens); - // Enqueues a Recv node onto the computation. The data comes from a Send - // instruction that shares the same channel handle and its shape must - // be the same as the given shape. XlaOp Recv(const Shape& shape, const ChannelHandle& handle); XlaOp RecvWithToken(const XlaOp& token, const Shape& shape, const ChannelHandle& handle); - // Normalizes operand across spatial and batch dimensions for each feature. - // - // Returns a tuple (normalized, batch_mean, batch_var) where `normalized` - // is the normalized result and batch_mean and batch_var are the mean and - // variance, respectively, across batch for the operand. XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale, const XlaOp& offset, float epsilon, int64 feature_index); - // Normalizes operand across spatial and batch dimensions for each feature. - // - // `BatchNormInference` is equivalent to calling `BatchNormTraining` without - // computing `mean` and `variance` for each batch inside the operation. It - // uses the input `mean` and `variance` instead as estimated values. The - // purpose of this op is to reduce latency in inference, hence the name - // `BatchNormInference`. - // - // The output has the same shape as `operand`, and contains the normalized - // values for each batch. XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale, const XlaOp& offset, const XlaOp& mean, const XlaOp& variance, float epsilon, int64 feature_index); - // Calculates the gradients of a batch norm op. - // - // The inputs `batch_mean` and `batch_var` represent the mean and variance - // across the batch. - // - // Returns a tuple of three elements: - // - grad_operand: Gradient with respect to input `operand` - // - grad_offset: Gradient with respect to input `offset` - // - grad_scale: Gradient with respect to input `scale` XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale, const XlaOp& batch_mean, const XlaOp& batch_var, const XlaOp& grad_output, float epsilon, @@ -1019,6 +745,9 @@ class XlaBuilder { // The instructions of this computation. std::vector instructions_; + // Dynamic parameter configuration of this computation. 
+ DynamicParameterBinding dynamic_parameter_binding_; + // A map from XlaOp::Handle to the index in the instructions_ vector where the // instruction is held. absl::flat_hash_map handle_to_index_; @@ -1096,7 +825,7 @@ class XlaBuilder { absl::Span broadcast_sizes); friend XlaOp BroadcastInDim( - const XlaOp& operand, const Shape& shape, + const XlaOp& operand, const absl::Span out_dim_size, const absl::Span broadcast_dimensions); friend XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value, @@ -1393,6 +1122,7 @@ class XlaScopedShardingAssignment { // Free functions for building XlaOps. The intention is that these will // become the public API for building XlaOps rather than calling methods on // XlaBuilder directly. +// // Enqueues a "retrieve parameter value" instruction for a parameter that was // passed to the computation. @@ -1488,7 +1218,8 @@ XlaOp Broadcast(const XlaOp& operand, absl::Span broadcast_sizes); // will generate output // {{1 , 1}, // {2 , 2}} -XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape, +XlaOp BroadcastInDim(const XlaOp& operand, + const absl::Span out_dim_size, const absl::Span broadcast_dimensions); // Enqueues a pad operation onto the computation that pads the given value on @@ -2138,6 +1869,7 @@ XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale, XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension); // Implementation details below this point. +// template XlaOp XlaBuilder::ConstantR0(NativeT value) { diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 8aa85c3cd63..b3f5be300d3 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -267,7 +267,7 @@ TEST_F(XlaBuilderTest, BinopHasInDimAndDegenerateBroadcast) { TEST_F(XlaBuilderTest, BroadcastInDim) { XlaBuilder b(TestName()); auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3}), "x"); - BroadcastInDim(x, ShapeUtil::MakeShape(F32, {2, 4, 3}), + BroadcastInDim(x, {2, 4, 3}, /*broadcast_dimensions=*/{0, 2}); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); @@ -277,7 +277,7 @@ TEST_F(XlaBuilderTest, BroadcastInDim) { TEST_F(XlaBuilderTest, BroadcastInDimWithDegeneratedDim) { XlaBuilder b(TestName()); auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 1, 4}), "x"); - BroadcastInDim(x, ShapeUtil::MakeShape(F32, {2, 3, 4}), + BroadcastInDim(x, {2, 3, 4}, /*broadcast_dimensions=*/{0, 1, 2}); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); EXPECT_THAT(module->entry_computation()->root_instruction(), @@ -446,5 +446,14 @@ TEST_F(XlaBuilderTest, ProtoMatches) { EXPECT_EQ(c0_string, c1_string); } +TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) { + XlaBuilder b(TestName()); + AfterAll(&b, {CreateToken(&b), ConstantR0(&b, 1.0)}); + Status status = b.Build().status(); + ASSERT_IS_NOT_OK(status); + EXPECT_THAT(status.error_message(), + ::testing::HasSubstr("All operands to AfterAll must be tokens")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_computation.cc index c9870b65b91..f317892c125 100644 --- a/tensorflow/compiler/xla/client/xla_computation.cc +++ b/tensorflow/compiler/xla/client/xla_computation.cc @@ -25,7 +25,7 @@ namespace xla { StatusOr XlaComputation::GetProgramShape() const { TF_RET_CHECK(proto_.has_host_program_shape()); - return 
proto_.host_program_shape(); + return ProgramShape(proto_.host_program_shape()); } StatusOr> XlaComputation::Snapshot() const { diff --git a/tensorflow/compiler/xla/client/xla_computation.h b/tensorflow/compiler/xla/client/xla_computation.h index 71598ef8b29..3ccbfb28bd0 100644 --- a/tensorflow/compiler/xla/client/xla_computation.h +++ b/tensorflow/compiler/xla/client/xla_computation.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 033887d7c11..d7e7b9e6218 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -54,7 +54,7 @@ void SetDebugOptionsDefaults(DebugOptions* flags) { // TODO(jlebar): Disable fastmath once doing so is not a performance // regression. flags->set_xla_cpu_enable_fast_math(true); - flags->set_xla_gpu_enable_fast_math(true); + flags->set_xla_gpu_enable_fast_min_max(true); flags->set_xla_force_host_platform_device_count(1); } @@ -160,11 +160,11 @@ void AllocateFlags() { "Enable unsafe fast-math optimizations in the CPU compiler; " "this may produce faster code at the expense of some accuracy."), tensorflow::Flag( - "xla_gpu_enable_fast_math", - bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math), - flag_values->xla_cpu_enable_fast_math(), - "Enable unsafe fast-math optimizations in the GPU compiler; " - "this may produce faster code at the expense of some accuracy."), + "xla_gpu_enable_fast_min_max", + bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max), + flag_values->xla_gpu_enable_fast_min_max(), + "Enable fast floating point min/max lowering that does not propagate " + "NaNs."), tensorflow::Flag( "xla_llvm_enable_alias_scope_metadata", bool_setter_for( @@ -335,7 +335,7 @@ void AllocateFlags() { "behavior to help run tests on the host that run models in parallel " "across multiple devices."), }); - ParseFlagsFromEnv(*flag_objects); + ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects); } } // namespace diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py index fb135f5ceda..1fea816a803 100644 --- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py +++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py @@ -18,12 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import math - import numpy as _np # Avoids becoming a part of public Tensorflow API. from tensorflow.compiler.xla import xla_data_pb2 -from tensorflow.compiler.xla.python_api import xla_shape from tensorflow.core.framework import attr_value_pb2 @@ -64,22 +61,18 @@ class Sharding(object): tile_assignment_devices=[core])) @classmethod - def tile(cls, tile_shape, tile_assignment): + def tile(cls, tile_assignment): """Returns a Tiled sharding attribute. This causes an op to be partially computed on multiple cores in the XLA device. Args: - tile_shape: A xla_shape.Shape describing the tile shape that each core - will compute. - The tile shape does not need to be divisible by the tile assignment. tile_assignment: An np.ndarray describing the topology of the tiling and which device will compute which part of the topology. 
Raises: - TypeError: tile_assignment was not of np.array type or tile_shape was - not of xla_shape.Shape type. + TypeError: tile_assignment was not of np.array type. TODO(jmolloy): This concept is nefarious and is not something we really want to expose to users (especially as the @@ -87,14 +80,11 @@ class Sharding(object): """ if not isinstance(tile_assignment, _np.ndarray): raise TypeError('Tile assignment must be of type np.ndarray') - if not isinstance(tile_shape, xla_shape.Shape): - raise TypeError('Tile shape must be of type xla_shape.Shape') dims = list(tile_assignment.shape) flattened_devices = tile_assignment.reshape(-1, order='C') return Sharding( proto=xla_data_pb2.OpSharding( type=xla_data_pb2.OpSharding.OTHER, - tile_shape=tile_shape.message, tile_assignment_dimensions=dims, tile_assignment_devices=list(flattened_devices))) @@ -118,14 +108,8 @@ class Sharding(object): shape = tensor.shape.as_list() if shape[split_dimension] < num_devices: raise ValueError('Split dimension was smaller than the required number ' - 'of splits: shape=%r, dimension=%r, num_devices=%r', - shape, split_dimension, num_devices) - - tile_shape = shape - tile_shape[split_dimension] = int( - math.ceil(tile_shape[split_dimension] / num_devices)) - tile_shape_proto = xla_data_pb2.Shape( - element_type=xla_data_pb2.F32, dimensions=tile_shape) + 'of splits: shape=%r, dimension=%r, num_devices=%r' % + (shape, split_dimension, num_devices)) tile_assignment_dims = [1] * len(shape) tile_assignment_dims[split_dimension] = num_devices @@ -133,7 +117,6 @@ class Sharding(object): return Sharding( proto=xla_data_pb2.OpSharding( type=xla_data_pb2.OpSharding.OTHER, - tile_shape=tile_shape_proto, tile_assignment_dimensions=tile_assignment_dims, tile_assignment_devices=range(num_devices))) @@ -149,7 +132,6 @@ class Sharding(object): type=xla_data_pb2.OpSharding.TUPLE, tuple_shardings=tuple_shardings) else: proto = self._proto - attr_value = attr_value_pb2.AttrValue(s=proto.SerializeToString()) # TODO(jmolloy): This need to be seriously revisited before declaring this # API available for public use. 
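With `tile_shape` removed, both `Sharding.tile()` and the module-level `tile()` wrapper describe a tiled sharding purely by its `np.ndarray` device assignment. The following is a rough usage sketch under that assumption; the placeholder tensors and the two-device topology are invented for illustration, and it assumes `Sharding.split` keeps its `(tensor, split_dimension, num_devices)` signature.

```python
import numpy as np
import tensorflow as tf
from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding

a = tf.placeholder(tf.float32, shape=[8, 16])
b = tf.placeholder(tf.float32, shape=[8, 16])

# Tile `a` 2x1 across two devices; only the np.ndarray tile assignment is
# needed now (previously an xla_shape.Shape tile_shape was also required).
a = xla_sharding.tile(a, np.array([[0], [1]]))

# Sharding.split builds an equivalent OpSharding from a split dimension and a
# device count (assuming the (tensor, split_dimension, num_devices) signature)
# and can then be applied to the tensor's producing op.
xla_sharding.Sharding.split(b, 0, 2).apply_to_tensor(b)
```

The tile shape is now inferred from the tensor and the assignment, which is what allows the `xla_shape` dependency above to be dropped.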
@@ -194,8 +176,8 @@ def assign_device(tensor, device): return tensor -def tile(tensor, tile_shape, tile_assignment): - Sharding.tile(tile_shape, tile_assignment).apply_to_tensor(tensor) +def tile(tensor, tile_assignment): + Sharding.tile(tile_assignment).apply_to_tensor(tensor) return tensor diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml index bcfbcc3a22f..12b7094705e 100644 --- a/tensorflow/compiler/xla/g3doc/_book.yaml +++ b/tensorflow/compiler/xla/g3doc/_book.yaml @@ -3,15 +3,15 @@ upper_tabs: - include: /_upper_tabs_left.yaml - include: /api_docs/_upper_tabs_api.yaml # Dropdown menu -- name: Ecosystem - path: /ecosystem +- name: Resources + path: /resources is_default: true menu: - - include: /ecosystem/_menu_toc.yaml + - include: /resources/_menu_toc.yaml lower_tabs: # Subsite tabs other: - - name: Guide + - name: Guide & Tutorials contents: - title: XLA overview path: /xla/overview @@ -27,3 +27,7 @@ upper_tabs: path: /xla/shapes - title: Using AOT compilation path: /xla/tfcompile + - heading: Tutorials + - title: XLA compile API + path: /xla/tutorials/xla_compile + status: experimental diff --git a/tensorflow/compiler/xla/g3doc/_index.yaml b/tensorflow/compiler/xla/g3doc/_index.yaml index 7934cd11ba2..858de427119 100644 --- a/tensorflow/compiler/xla/g3doc/_index.yaml +++ b/tensorflow/compiler/xla/g3doc/_index.yaml @@ -17,7 +17,7 @@ landing_page: - classname: devsite-landing-row-cards items: - heading: XLA - TensorFlow, compiled - image_path: /ecosystem/images/tf-logo-card-16x9.png + image_path: /resources/images/tf-logo-card-16x9.png path: https://developers.googleblog.com/2017/03/xla-tensorflow-compiled.html buttons: - label: Read on Google Developers blog @@ -28,7 +28,7 @@ landing_page: - label: Watch the video path: https://www.youtube.com/watch?v=kAOanJczHA0 - heading: XLA on GitHub - image_path: /ecosystem/images/github-card-16x9.png + image_path: /resources/images/github-card-16x9.png path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla buttons: - label: View on GitHub diff --git a/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure1.png b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure1.png new file mode 100644 index 00000000000..00cefe4c780 Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure1.png differ diff --git a/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure2.png b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure2.png new file mode 100644 index 00000000000..6439c6e4027 Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure2.png differ diff --git a/tensorflow/compiler/xla/g3doc/jit.md b/tensorflow/compiler/xla/g3doc/jit.md index ded1e582b24..85fa16ccc7f 100644 --- a/tensorflow/compiler/xla/g3doc/jit.md +++ b/tensorflow/compiler/xla/g3doc/jit.md @@ -86,7 +86,7 @@ on uncompilable operator, xla.compile() returns an explicit error. This is useful if you want more predictable behaviors from XLA compilation. Please see -[xla.compile() tutorial Colab](https://colab.sandbox.google.com/github/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb) +[xla.compile() tutorial Colab](./tutorials/xla_compile.ipynb) for how to use it. ### Placing operators on XLA devices @@ -144,7 +144,7 @@ Execute the python script to train the model with XLA and turn on a debugging feature of XLA via an environmental variable that outputs the XLA graph. 
```shell -TF_XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py +XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py ``` Open the timeline file created (`timeline.ctf.json`). The rendered timeline diff --git a/tensorflow/compiler/xla/g3doc/layout_with_tiling.md b/tensorflow/compiler/xla/g3doc/layout_with_tiling.md new file mode 100644 index 00000000000..5e990851af7 --- /dev/null +++ b/tensorflow/compiler/xla/g3doc/layout_with_tiling.md @@ -0,0 +1,159 @@ +# Tiled layout + +*Note: This doc describes how tiled layout is intended to work. Tiling is being +implemented, but this is an early effort and it is currently not even guaranteed +to get an Unimplemented error if one tries to use tiling - it may be just +silently ignored.* + +
![](images/xla_array_layout_figure1.png) + +Figure 1
+
+Figure 1 shows how an array F32[3,5] is laid out in memory with 2x2 tiling. A
+shape with this layout is written as F32[3,5]{1,0:(2,2)}, where 1,0 relates to
+the physical order of dimensions (minor_to_major field in Layout) while (2,2)
+after the colon indicates tiling of the physical dimensions by a 2x2 tile.
+
+Intuitively, tiles are laid out to cover the shape, and within each tile the
+elements are then laid out without tiling, as in the example above, where the
+right part of the example shows the layout in memory, including the white
+padding elements that are added in order to have complete 2x2 tiles even though
+the original array bounds are not even.
+
+The extra elements in the padding are not required to contain any particular
+value.
+
+## Linear index formulas for tiling given a shape and a tile
+
+Without tiling, an element e = (e_n, e_{n-1}, ..., e_1) in an array with array
+bounds d = (d_n, d_{n-1}, ..., d_1) (d_1 is the most minor dimension) is laid
+out by major to minor order at position:
+
+   linear_index(e, d) \
+   = linear_index((e_n, e_{n-1}, ..., e_1), (d_n, d_{n-1}, ..., d_1)) \
+   = e_n∙d_{n-1}∙...∙d_1 + e_{n-1}∙d_{n-2}∙...∙d_1 + ... + e_1
+
+For simplicity of notation in this document we assume a tile has the same
+number of dimensions as the array. In XLA's implementation of tiling, this is
+generalized to tilings with fewer dimensions by leaving the initial most-major
+dimensions unchanged and applying the tiling only to the most minor dimensions,
+so that the tiling that is specified mentions a suffix of the physical
+dimensions of the shape being tiled.
+
+When tiling of size (t_n, t_{n-1}, ..., t_1) is used, an element in the array
+with indices (e_n, e_{n-1}, ..., e_1) is mapped to this position in the final
+layout:
+
+   linear_index_with_tile(e, d, t) \
+   = linear_index((⌊e/t⌋, e mod t), (⌈d/t⌉, t))
+     (arithmetic is elementwise, (a,b) is concatenation) \
+   = linear_index((⌊e_n/t_n⌋, ..., ⌊e_1/t_1⌋, e_n mod t_n, ..., e_1 mod t_1),
+     (⌈d_n/t_n⌉, ..., ⌈d_1/t_1⌉, t_n, t_{n-1}, ..., t_1)) \
+   = linear_index((⌊e_n/t_n⌋, ..., ⌊e_1/t_1⌋), (⌈d_n/t_n⌉, ..., ⌈d_1/t_1⌉))
+     ∙ t_n∙t_{n-1}∙...∙t_1
+     + linear_index((e_n mod t_n, ..., e_1 mod t_1), (t_n, t_{n-1}, ..., t_1))
+
+The layout can be thought of as having two parts:
+(⌊e_n/t_n⌋, ..., ⌊e_1/t_1⌋), which corresponds to a tile index in an array of
+tiles of size (⌈d_n/t_n⌉, ..., ⌈d_1/t_1⌉), and
+(e_n mod t_n, ..., e_1 mod t_1), which corresponds to a within-tile index. The
+ceil function appears in ⌈d_i/t_i⌉ because if tiles overrun the bounds of the
+larger array, padding is inserted as in Figure 1. Both the tiles and the
+elements within tiles are laid out recursively without tiling.
+
+For the example in Figure 1, element (2,3) has tile index (1,1) and within-tile
+index (0,1), for a combined coordinate vector of (1, 1, 0, 1). The tile indices
+have bounds (2, 3) and the tile itself is (2, 2) for a combined vector of
+(2, 3, 2, 2). The linear index with tile for the element with index (2, 3) in
+the logical shape is then
+
+   linear_index_with_tile((2,3), (3,5), (2,2)) \
+   = linear_index((1,1,0,1), (2,3,2,2)) \
+   = linear_index((1,1), (2,3)) ∙ 2 ∙ 2 + linear_index((0,1), (2,2)) \
+   = (1 ∙ 3 + 1) ∙ 2 ∙ 2 + (0 ∙ 2 + 1) \
+   = 17.
+
+# Tiling as pad-reshape-transpose
+
+Tiling-based layout operates as follows: \
+Consider an array of dimensions (d_n, d_{n-1}, ..., d_1) (d_1 is the most minor
+dimension). When it's laid out with tiling of size (t_n, t_{n-1}, ..., t_1)
+(t_1 is the most minor dimension), that tiling can be described in terms of
+pad-reshape-transpose in the following way.
+
+1. The array is padded to (⌈d_n/t_n⌉∙t_n, ..., ⌈d_1/t_1⌉∙t_1).
+2. Each dimension i is broken into (⌈d_i/t_i⌉, t_i), i.e. the array is
+   reshaped to \
+   (⌈d_n/t_n⌉, t_n, ..., ⌈d_1/t_1⌉, t_1). \
+   There is no physical layout change in this reshape by itself, so this
+   reshape is a bitcast. If one is not explicitly thinking of a tiling, this
+   reshape could express any shape with the same number of elements as the
+   padded shape - the example here is of how to express a tile in this way.
+3. A transpose happens by moving t_n, ..., t_1 to the most minor dimensions
+   while keeping their relative order, so that the order of dimensions from
+   most major to most minor becomes \
+   (⌈d_n/t_n⌉, ..., ⌈d_1/t_1⌉, t_n, ..., t_1).
+
+The final shape has the prefix \
+(⌈d_n/t_n⌉, ..., ⌈d_1/t_1⌉), which describes the number of tiles in each
+dimension. An element in the array (e_n, ..., e_1) is mapped to this element in
+the final shape: \
+(⌊e_n/t_n⌋, ..., ⌊e_1/t_1⌋, e_n mod t_n, ..., e_1 mod t_1). It is easy to see
+that the linear index of the element follows the formula above as expected.
+
+# Repeated tiling
+
+XLA's tiling becomes even more flexible by applying it repeatedly.
+
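+The index arithmetic above is mechanical, so a small sketch may help make it
+concrete. The following Python snippet is illustrative only and not part of
+the XLA code base: it computes the single-level tiled linear index from the
+formula in the previous section and reproduces the Figure 1 example; each
+additional tiling level can conceptually be handled by applying the same
+transformation again to the resulting shape.
+
+```python
+def linear_index(index, bounds):
+  # Major-to-minor (row-major) linear index of `index` within `bounds`.
+  linear = 0
+  for i, b in zip(index, bounds):
+    linear = linear * b + i
+  return linear
+
+
+def linear_index_with_tile(index, bounds, tile):
+  # linear_index((floor(e/t), e mod t), (ceil(d/t), t)), elementwise.
+  tile_index = [i // t for i, t in zip(index, tile)]
+  within_tile = [i % t for i, t in zip(index, tile)]
+  tile_bounds = [-(-b // t) for b, t in zip(bounds, tile)]  # ceil(b / t)
+  return linear_index(tile_index + within_tile, tile_bounds + list(tile))
+
+
+# Element (2, 3) of an F32[3,5] array with 2x2 tiling, as in Figure 1:
+assert linear_index_with_tile([2, 3], [3, 5], [2, 2]) == 17
+```
+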
![](images/xla_array_layout_figure2.png) + +Figure 2
+ +Figure 2 shows how an array of size 4x8 is tiled by two levels of tiling (first +2x4 then 2x1). We represent this repeated tiling as (2,4)(2,1). Each color +indicates a 2x4 tile and each red border box is a 2x1 tile. The numbers +indicates the linear index in memory of that element in the tiled format. This +format matches the format used for BF16 on TPU, except that the initial tile is +bigger, namely the tiling is (8,128)(2,1), where the purpose of the second +tiling by 2x1 is to collect together two 16 bit values to form one 32 bit value +in a way that aligns with the architecture of a TPU. + +Note that a second or later tile can refer to both the minor within-tile +dimensions, which just rearranges data within the tile, as in this example with +(8,128)(2,1), but can also refer to the major cross-tile dimensions from the +prior tiling. + +# Combining dimensions using tiles + +XLA's tiling also supports combining dimensions. For example, it can combine +dimensions in F32[2,7,8,11,10]{4,3,2,1,0} into F32[112,110]{1,0} first before +tiling it with (2,3). The tile used is (∗,∗,2,∗,3). Here an +asterisk in a tile implies taking that dimension and combining it with the next +more minor dimension. Multiple adjacent dimensions can be subsumed together into +one dimension. A subsumed dimension is represented by a tile value of -1 in that +dimension of the tile, which is not otherwise valid in a tile as a dimension +size. + +More precisely, if dimension i of the shape is eliminated via an asterisk in the +tile, then before the prior definition of tiling is applied, that dimension is +removed from both the shape being tiled and the tile vector, and what was +dimension i-1 of the shape has its array bound increased from di-1 to +didi-1. This step is repeated for each asterisk in the +tile vector. diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 73a9db75f6b..d888b1f23f3 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -13,6 +13,22 @@ arbitrary-dimensional array. For convenience, special cases have more specific and familiar names; for example a *vector* is a 1-dimensional array and a *matrix* is a 2-dimensional array. +## AfterAll + +See also +[`XlaBuilder::AfterAll`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). + +AfterAll takes a variadic number of tokens and produces a single token. Tokens +are primitive types which can be threaded between side-effecting operations to +enforce ordering. `AfterAll` can be used as a join of tokens for ordering a +operation after a set operations. + + `AfterAll(operands)` + +Arguments | Type | Semantics +---------- | ------- | ------------------------- +`operands` | `XlaOp` | variadic number of tokens + ## AllToAll See also @@ -402,6 +418,33 @@ then v12 == f32[8x3] {{10, 11, 12}, ``` +## CollectivePermute + +See also +[`XlaBuilder::CollectivePermute`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). + +CollectivePermute is a collective operation that sends and receives data cross +replicas. + + `CollectivePermute(operand, source_target_pairs)` + +| Arguments | Type | Semantics | +| --------------------- | ----------------------- | -------------------------- | +| `operand` | `XlaOp` | n dimensional input array | +| `source_target_pairs` | `` vector | A list of | +: : : (source_replica_id, : +: : : target_replica_id) pairs. 
: +: : : For each pair, the operand : +: : : is sent from source : +: : : replica to target replica. : + +Note that there are the following restrictions on the `source_target_pair`: + +- Any two pairs should not have the same target replica id, and they should + not have the same source replica id. +- If a replica id is not a target in any pair, then the output on that replica + is a tensor consists of 0(s) with the same shape as the input. + ## Concatenate See also @@ -1423,10 +1466,11 @@ Builds a constant literal on device rather than a potentially large host transfer. Creates a rank 1 array of values starting at zero and incrementing by one. -Arguments | Type | Semantics ---------- | --------------- | ------------------------------------ -`type` | `PrimitiveType` | type U -`size` | `int64` | The number of elements in the array. +Arguments | Type | Semantics +---------------- | --------------- | ------------------------------------ +`type` | `PrimitiveType` | type U +`size` | `int64` | The number of elements in the array. +`iota_dimension` | `int64` | The dimension to increment along. ## Map @@ -1780,8 +1824,9 @@ XlaBuilder builder(client_, "reduce_window_2x3"); auto shape = ShapeUtil::MakeShape(F32, {4, 6}); auto input = builder.Parameter(0, shape, "input"); builder.ReduceWindow( - input, *max, + input, /*init_val=*/builder.ConstantLiteral(LiteralUtil::MinValue(F32)), + *max, /*window_dimensions=*/{2, 3}, /*window_stride_dimensions=*/{2, 3}, Padding::kValid); diff --git a/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb index a83e3f78598..2a83092805b 100644 --- a/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb +++ b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb @@ -1,25 +1,38 @@ { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "The XLA compile API", + "version": "0.3.2", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, "cells": [ { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "f4TSNCvpENrW" }, + "cell_type": "markdown", "source": [ "##### Copyright 2018 The TensorFlow Authors." ] }, { - "cell_type": "code", - "execution_count": 0, "metadata": { "cellView": "form", - "colab": {}, "colab_type": "code", - "id": "vamNSA0vEP-m" + "id": "vamNSA0vEP-m", + "colab": {} }, - "outputs": [], + "cell_type": "code", "source": [ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -32,139 +45,84 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." 
- ] - }, - { - "cell_type": "code", + ], "execution_count": 0, - "metadata": { - "cellView": "form", - "colab": {}, - "colab_type": "code", - "id": "xD_ydfejEV7H" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 FranƧois Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] + "outputs": [] }, { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "e1oSi4lHFt3z" }, + "cell_type": "markdown", "source": [ - "# Welcome to `xla.compile()` tutorial" + "# The XLA compile API" ] }, { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "b7noD9NjFRL-" }, + "cell_type": "markdown", "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/xla/jit#turning_on_jit_compilation\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.sandbox.google.com/github/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" + "\n", + " \n", + " \n", + " \n", + "
\n", + " View on TensorFlow.org\n", + " \n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
" ] }, { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "v9YbsuLZaBXy" }, + "cell_type": "markdown", "source": [ - "xla.compile() is a new experimental API that compiles part or all of a model with [XLA](https://www.tensorflow.org/extend/xla/).\n", "\n", - "Please run all code blocks in order." + "\n", + "Import TensorFlow and the XLA library. XLA contains `xla.compile()`, an experimental API that compiles part or all of a model with [XLA](https://www.tensorflow.org/extend/xla/)." ] }, { - "cell_type": "code", - "execution_count": 0, "metadata": { - "colab": {}, "colab_type": "code", - "id": "45kUPj5ZFrRa" + "id": "45kUPj5ZFrRa", + "colab": {} }, - "outputs": [], - "source": [ - "import tensorflow as tf" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "9NMQFjroSMns" - }, - "source": [ - "Imports XLA library, which includes xla.compile() experimental API." - ] - }, - { "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "-Uggy03rSGJm" - }, - "outputs": [], "source": [ + "import tensorflow as tf\n", + "\n", "from tensorflow.contrib.compiler import xla" - ] + ], + "execution_count": 0, + "outputs": [] }, { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "GZVNiRmTDV-5" }, + "cell_type": "markdown", "source": [ - "Define some necessary constants and prepare MNIST dataset." + "Define some necessary constants and prepare the MNIST dataset." ] }, { - "cell_type": "code", - "execution_count": 0, "metadata": { - "colab": {}, "colab_type": "code", - "id": "f37TSEGvGX4_" + "id": "f37TSEGvGX4_", + "colab": {} }, - "outputs": [], + "cell_type": "code", "source": [ "# Size of each input image, 28 x 28 pixels\n", "IMAGE_SIZE = 28 * 28\n", @@ -174,17 +132,17 @@ "TRAIN_BATCH_SIZE = 100\n", "# Number of training steps to run\n", "TRAIN_STEPS = 1000" - ] + ], + "execution_count": 0, + "outputs": [] }, { - "cell_type": "code", - "execution_count": 0, "metadata": { - "colab": {}, "colab_type": "code", - "id": "TiVXchblG5hK" + "id": "TiVXchblG5hK", + "colab": {} }, - "outputs": [], + "cell_type": "code", "source": [ "# Loads MNIST dataset.\n", "train, test = tf.keras.datasets.mnist.load_data()\n", @@ -195,16 +153,18 @@ "images, labels = iterator.get_next()\n", "images = tf.reshape(images, [-1, IMAGE_SIZE])\n", "images, labels = tf.cast(images, tf.float32), tf.cast(labels, tf.int64)" - ] + ], + "execution_count": 0, + "outputs": [] }, { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "x_ZehpZP-SfS" }, + "cell_type": "markdown", "source": [ - "## Defines build_mnist_model function to construct model\n", + "# Define the model constructing function\n", "\n", "Following code block contains a function that constructs a simple model with one dense layer, including both forward and backward propagation.\n", "\n", @@ -212,14 +172,12 @@ ] }, { - "cell_type": "code", - "execution_count": 0, "metadata": { - "colab": {}, "colab_type": "code", - "id": "ZbhJl_WvGa3g" + "id": "ZbhJl_WvGa3g", + "colab": {} }, - "outputs": [], + "cell_type": "code", "source": [ "def build_mnist_model(x, y_):\n", " y = tf.keras.layers.Dense(NUM_CLASSES).apply(x)\n", @@ -228,47 +186,41 @@ " train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)\n", "\n", " return y, train_step" - ] + ], + "execution_count": 0, + "outputs": [] }, { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "7Jh3lyQHDfM9" }, - "source": [ - "## Uses xla.compile with 
build_mnist_model function to enable XLA" - ] - }, - { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "EtDwez_1gjzv" - }, "source": [ - "Following code block wraps the model with xla.compile(), which allows the target function with provided inputs to be executed by XLA." + "# Enable XLA\n", + "\n", + "Use `xla.compile` with the `build_mnist_model` function to enable XLA. Following code block wraps the model with `xla.compile()`, which allows the target function with provided inputs to be executed by XLA." ] }, { - "cell_type": "code", - "execution_count": 0, "metadata": { - "colab": {}, "colab_type": "code", - "id": "kYpCXCdRHNuN" + "id": "kYpCXCdRHNuN", + "colab": {} }, - "outputs": [], + "cell_type": "code", "source": [ "[y] = xla.compile(build_mnist_model, inputs=[images, labels])" - ] + ], + "execution_count": 0, + "outputs": [] }, { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "4giQh62IrZGF" }, + "cell_type": "markdown", "source": [ "When compiling the graph, XLA replaces all the graph nodes constructed in the target function with a few XLA ops.\n", "\n", @@ -293,62 +245,62 @@ ] }, { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "TPGas4jjFLZl" }, + "cell_type": "markdown", "source": [ "If you were to print the constructed graph now, you will see that it is not much different from a normal Tensorflow graph and you won't be able to find XLA ops mentioned before. This is because the actual compilation happens later when you try to execute the graph with `sess.run()`. At that time, Tensorflow triggers a series of graph rewrite passes that actually generate XLA ops, which compiles and executes computation when all inputs are ready." ] }, { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "EZD1m_n1DxAF" }, + "cell_type": "markdown", "source": [ - "## Trains and tests the model" + "# Train and test the model" ] }, { - "cell_type": "code", - "execution_count": 0, "metadata": { - "colab": {}, "colab_type": "code", - "id": "qe28bAHNHUG2" + "id": "qe28bAHNHUG2", + "colab": {} }, - "outputs": [], + "cell_type": "code", "source": [ "# Creates session and initialize all variables.\n", "# xla.compile() doesn't work with Keras model.fit() API or TF eager mode yet.\n", "sess = tf.Session()\n", "sess.run(tf.global_variables_initializer())" - ] + ], + "execution_count": 0, + "outputs": [] }, { - "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "qgsKmz3n2UiW" }, + "cell_type": "markdown", "source": [ - "Following code block trains model.\n", - "\n", - "Note that evaluating `y` also triggers its control dependency node `train_step`, which updates model variables." + "Following code block trains model. Evaluating `y` also triggers its control dependency node `train_step`, which updates model variables." 
] }, { - "cell_type": "code", - "execution_count": 0, "metadata": { - "colab": {}, "colab_type": "code", - "id": "_GxF6jTRHVuA" + "id": "_GxF6jTRHVuA", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "fbf299ca-02d5-4e95-f9fe-8f3c0432d132" }, - "outputs": [], + "cell_type": "code", "source": [ "# Feeds training dataset\n", "sess.run(iterator.make_initializer(train_ds))\n", @@ -356,18 +308,31 @@ "# Runs TRAIN_STEPS steps\n", "for i in range(TRAIN_STEPS):\n", " sess.run(y)\n", + "\n", "print(\"Model trained for %s steps.\" % TRAIN_STEPS)" + ], + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Model trained for 1000 steps.\n" + ], + "name": "stdout" + } ] }, { - "cell_type": "code", - "execution_count": 0, "metadata": { - "colab": {}, "colab_type": "code", - "id": "dHlQlRSRHXD1" + "id": "dHlQlRSRHXD1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "9c3677a2-ec84-406f-9d2c-d722844f3093" }, - "outputs": [], + "cell_type": "code", "source": [ "# Tests trained model\n", "\n", @@ -378,35 +343,31 @@ "correct_prediction = tf.equal(tf.argmax(y, 1), labels)\n", "accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))\n", "print(\"Prediction accuracy after training: %s\" % sess.run(accuracy))" + ], + "execution_count": 22, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Prediction accuracy after training: 0.91\n" + ], + "name": "stdout" + } ] }, { - "cell_type": "code", - "execution_count": 0, "metadata": { - "colab": {}, "colab_type": "code", - "id": "ynJQIuzjHYOb" + "id": "ynJQIuzjHYOb", + "colab": {} }, - "outputs": [], + "cell_type": "code", "source": [ "# Cleans up session\n", "sess.close()" - ] + ], + "execution_count": 0, + "outputs": [] } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "xla.compile() Tutorial", - "provenance": [], - "version": "0.3.2" - }, - "kernelspec": { - "display_name": "Python 2", - "name": "python2" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} + ] +} \ No newline at end of file diff --git a/tensorflow/compiler/xla/index_util.h b/tensorflow/compiler/xla/index_util.h index 458bdaf2f89..d76f61eb62c 100644 --- a/tensorflow/compiler/xla/index_util.h +++ b/tensorflow/compiler/xla/index_util.h @@ -21,6 +21,7 @@ limitations under the License. #include #include "absl/types/span.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/macros.h" diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc index 2398470dd49..dbb81381acd 100644 --- a/tensorflow/compiler/xla/layout_util.cc +++ b/tensorflow/compiler/xla/layout_util.cc @@ -460,6 +460,13 @@ std::ostream& operator<<(std::ostream& out, const Layout& layout) { } hash_value = Hash64Combine(hash_value, layout.max_sparse_elements()); + for (Tile tile : layout.tiles()) { + for (int64 tile_dim : tile.dimensions()) { + hash_value = Hash64Combine(hash_value, hash()(tile_dim)); + } + } + hash_value = Hash64Combine(hash_value, layout.element_size_in_bits()); + return hash_value; } diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h index 6e0390763da..6c298e57252 100644 --- a/tensorflow/compiler/xla/layout_util.h +++ b/tensorflow/compiler/xla/layout_util.h @@ -21,6 +21,7 @@ limitations under the License. 
#include #include "absl/types/span.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index cb00a0ab16d..8f480c1f107 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -27,6 +27,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -62,6 +63,14 @@ void ConvertEndianShort(char* bytes, int64 size) { } } +// Since Eigen::half doesn't satisfy the absl::bit_cast contract, we need to be +// able to transparently access the raw 16-bit value contained within. +template +T GetRawValue(T val) { + return val; +} +uint16 GetRawValue(Eigen::half val) { return val.x; } + } // namespace LiteralBase::~LiteralBase() {} @@ -283,16 +292,17 @@ Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal, if (!proto.has_shape()) { return InvalidArgument("LiteralProto has no shape"); } - if (ShapeUtil::HasPrimitiveType(proto.shape(), OPAQUE)) { + Shape shape(proto.shape()); + if (ShapeUtil::HasPrimitiveType(shape, OPAQUE)) { return InvalidArgument("Literal shape cannot include OPAQUE sub-shape"); } - if (!LayoutUtil::HasLayout(proto.shape())) { + if (!LayoutUtil::HasLayout(shape)) { return InvalidArgument("LiteralProto has no layout"); } - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(proto.shape())); + TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); - Literal literal(proto.shape()); + Literal literal(shape); TF_RETURN_IF_ERROR(literal.root_piece_->ForEachMutableSubpieceWithStatus( [&](const ShapeIndex& index, Piece* piece) { @@ -1012,166 +1022,143 @@ void LiteralBase::Piece::SortSparseElementsInternal() { namespace { +string ShapeToString(bool print_layout, const Shape& shape) { + return print_layout ? 
ShapeUtil::HumanStringWithLayout(shape) + : ShapeUtil::HumanString(shape); +} + +void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index, + bool print_layout, std::vector* pieces); + +void TupleToStringHelper(const LiteralBase& literal, + const ShapeIndex& shape_index, bool print_layout, + std::vector* pieces) { + const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index); + pieces->push_back(ShapeToString(print_layout, subshape)); + pieces->push_back(" (\n"); + std::vector tuple_pieces; + for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) { + ShapeIndex element_index = shape_index; + element_index.push_back(i); + std::vector element_pieces; + ToStringHelper(literal, element_index, print_layout, &element_pieces); + tuple_pieces.push_back(absl::StrJoin(element_pieces, "")); + } + pieces->push_back(absl::StrJoin(tuple_pieces, ",\n")); + pieces->push_back("\n)"); +} + +void SparseArrayToStringHelper(const LiteralBase& literal, + const Shape& subshape, bool print_layout, + std::vector* pieces) { + pieces->push_back(ShapeToString(print_layout, subshape)); + pieces->push_back("{"); + int64 rank = ShapeUtil::Rank(subshape); + int64 num_elements = literal.sparse_element_count(); + for (int64 i = 0; i < num_elements; ++i) { + if (i > 0) { + pieces->push_back(", "); + } + if (rank == 1) { + pieces->push_back(StrCat(literal.GetSparseIndex(i)[0])); + pieces->push_back(": "); + } else { + pieces->push_back("["); + pieces->push_back(absl::StrJoin(literal.GetSparseIndex(i), ", ")); + pieces->push_back("]: "); + } + pieces->push_back(literal.GetSparseElementAsString(i)); + } + pieces->push_back("}"); +} + +void DenseArrayToStringHelper(const LiteralBase& literal, + const ShapeIndex& shape_index, bool print_layout, + std::vector* pieces) { + const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index); + int64 rank = ShapeUtil::Rank(subshape); + + std::function dimensions, std::vector*)> + to_string_recursive = [&](absl::Span dimensions, + std::vector* accum_indices) { + // dimensions.size() decreases by 1 at each recursive call, + // and accum_indices->size() increases by 1. + // Their sum is equal to the rank of the tensor. + CHECK_EQ(rank, dimensions.size() + accum_indices->size()); + + auto brace_to_string = [&](string brace) -> string { + // Handle 1D tensor + if (rank == 1) { + return brace; + } + // Handle the innermost tensor of a 2D+ tensor. + if (dimensions.size() == 1 && brace == "{") { + return StrCat(" ", brace, dimensions[0] <= 1 ? "" : " "); + } + if (dimensions.size() == 1 && brace == "}") { + return StrCat(dimensions[0] <= 1 ? "" : " ", brace); + } + // Handle the non-innermost tensors of a 2D+ tensor. + if (brace == "{") { + if (rank > 3 && !accum_indices->empty() && + accum_indices->size() < rank) { + int index = accum_indices->size() - 1; + int value = accum_indices->back(); + return StrCat(brace, " /*i", index, "=", value, "*/\n"); + } + return StrCat(brace, "\n"); + } + return StrCat("\n", brace); + }; + + if (dimensions.empty()) { + // Display predicates as 0s and 1s so that the string is more dense. + string elem; + if (subshape.element_type() == PRED && rank > 0) { + elem = literal.Get(*accum_indices, shape_index) ? 
"1" : "0"; + } else { + elem = literal.GetAsString(*accum_indices, shape_index); + } + pieces->push_back(elem); + } else { + pieces->push_back(brace_to_string("{")); + for (int i = 0; i < dimensions[0]; ++i) { + std::vector cloned_indices(*accum_indices); + cloned_indices.push_back(i); + to_string_recursive(dimensions.subspan(1), &cloned_indices); + if (i < dimensions[0] - 1) { + pieces->push_back(","); + pieces->push_back(dimensions.size() > 1 ? "\n" : " "); + } + } + pieces->push_back(brace_to_string("}")); + } + }; + + if (rank > 1) { + pieces->push_back(ShapeToString(print_layout, subshape)); + pieces->push_back(" "); + } + std::vector indices = {}; + std::vector dimensions(subshape.dimensions().begin(), + subshape.dimensions().end()); + to_string_recursive(dimensions, &indices); +} + void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index, bool print_layout, std::vector* pieces) { const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index); CHECK(LayoutUtil::HasLayout(literal.shape())); CHECK(LayoutUtil::HasLayout(subshape)); - - auto shape_to_string = [print_layout](const Shape& shape) { - if (print_layout) { - return ShapeUtil::HumanStringWithLayout(shape); - } else { - return ShapeUtil::HumanString(shape); - } - }; - - // TODO(b/32894291): refactor this code to reduce code duplication. if (ShapeUtil::IsTuple(subshape)) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" (\n"); - std::vector tuple_pieces; - for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) { - ShapeIndex element_index = shape_index; - element_index.push_back(i); - std::vector element_pieces; - ToStringHelper(literal, element_index, print_layout, &element_pieces); - tuple_pieces.push_back(absl::StrJoin(element_pieces, "")); - } - pieces->push_back(absl::StrJoin(tuple_pieces, ",\n")); - pieces->push_back("\n)"); - return; - } - - if (ShapeUtil::IsToken(subshape)) { + TupleToStringHelper(literal, shape_index, print_layout, pieces); + } else if (ShapeUtil::IsToken(subshape)) { pieces->push_back("token"); - return; - } - - if (LayoutUtil::IsSparseArray(subshape)) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back("{"); - int64 rank = ShapeUtil::Rank(subshape); - int64 num_elements = literal.sparse_element_count(); - for (int64 i = 0; i < num_elements; ++i) { - if (i > 0) { - pieces->push_back(", "); - } - if (rank == 1) { - pieces->push_back(StrCat(literal.GetSparseIndex(i)[0])); - pieces->push_back(": "); - } else { - pieces->push_back("["); - pieces->push_back(absl::StrJoin(literal.GetSparseIndex(i), ", ")); - pieces->push_back("]: "); - } - pieces->push_back(literal.GetSparseElementAsString(i)); - } - pieces->push_back("}"); - return; - } - - CHECK(LayoutUtil::IsDenseArray(subshape)); - - auto element_to_string = [&](absl::Span indices) -> string { - PrimitiveType element_type = subshape.element_type(); - // We display predicates as 0s and 1s so that the string is more dense. - string elem = element_type == PRED - ? literal.Get(indices, shape_index) ? "1" : "0" - : literal.GetAsString(indices, shape_index); - return ((!indices.empty() && indices.back() > 0) ? 
", " : "") + elem; - }; - - if (ShapeUtil::Rank(subshape) == 0) { - pieces->push_back(literal.GetAsString({}, shape_index)); - } else if (ShapeUtil::Rank(subshape) == 1) { - pieces->push_back("{"); - for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(element_to_string({i0})); - } - pieces->push_back("}"); - } else if (ShapeUtil::Rank(subshape) == 2) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" {\n"); - for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(" { "); - for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { - pieces->push_back(element_to_string({i0, i1})); - } - pieces->push_back(" "); - pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "}\n" : "},\n"); - } - pieces->push_back("}"); - } else if (ShapeUtil::Rank(subshape) == 3) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" {\n"); - for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(i0 > 0 ? ",\n{" : "{"); - for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { - pieces->push_back(i1 > 0 ? ",\n { " : " { "); - for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { - pieces->push_back(element_to_string({i0, i1, i2})); - } - pieces->push_back(" }"); - } - pieces->push_back(" }"); - } - pieces->push_back("\n}"); - } else if (ShapeUtil::Rank(subshape) == 4) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" {\n"); - for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(StrFormat(" { /*i0=%d*/\n", i0)); - for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { - pieces->push_back(StrFormat(" { /*i1=%d*/\n", i1)); - for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { - pieces->push_back(" {"); - for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) { - pieces->push_back(element_to_string({i0, i1, i2, i3})); - } - pieces->push_back(i2 == subshape.dimensions(2) - 1 ? "}\n" : "},\n"); - } - pieces->push_back(i1 == subshape.dimensions(1) - 1 ? " }\n" - : " },\n"); - } - pieces->push_back(i0 == subshape.dimensions(0) - 1 ? " }\n" : " },\n"); - } - pieces->push_back("}"); - } else if (ShapeUtil::Rank(subshape) == 5) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" {\n"); - for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(StrFormat(" { /*i0=%d*/\n", i0)); - for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { - pieces->push_back(StrFormat(" { /*i1=%d*/\n", i1)); - for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { - pieces->push_back(StrFormat(" { /*i2=%d*/\n", i2)); - for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) { - pieces->push_back(" {"); - for (int64 i4 = 0; i4 < subshape.dimensions(4); ++i4) { - pieces->push_back(element_to_string({i0, i1, i2, i3, i4})); - } - pieces->push_back(i3 == subshape.dimensions(3) - 1 ? "}\n" - : "},\n"); - } - pieces->push_back(i2 == subshape.dimensions(2) - 1 ? " }\n" - : " },\n"); - } - pieces->push_back(i1 == subshape.dimensions(1) - 1 ? " }\n" - : " },\n"); - } - pieces->push_back(i0 == subshape.dimensions(0) - 1 ? 
" }\n" : " },\n"); - } - pieces->push_back("}"); + } else if (LayoutUtil::IsSparseArray(subshape)) { + SparseArrayToStringHelper(literal, subshape, print_layout, pieces); } else { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" {"); - literal.EachCellAsString( - [&](absl::Span indices, const string& value) { - pieces->push_back(" "); - pieces->push_back(value); - }); - pieces->push_back("}"); + CHECK(LayoutUtil::IsDenseArray(subshape)); + DenseArrayToStringHelper(literal, shape_index, print_layout, pieces); } } @@ -1228,16 +1215,32 @@ Literal ConvertBetweenNativeTypes(const LiteralBase& src_literal) { } template -typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT)), +typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT) && + !std::is_same::value), Literal>::type BitcastBetweenNativeTypes(const LiteralBase& src_literal) { auto converter = [](NativeSrcT src) { - return absl::bit_cast(src); + return absl::bit_cast(GetRawValue(src)); }; return ConvertBetweenNativeTypesWithConverter( src_literal, converter); } +template +typename std::enable_if<(sizeof(NativeSrcT) == sizeof(Eigen::half) && + std::is_same::value), + Literal>::type +BitcastBetweenNativeTypes(const LiteralBase& src_literal) { + // Eigen::half doesn't satisfy the absl::bit_cast contract, so explicitly + // cast to unsigned short and then use raw_uint16_to_half. + auto converter = [](NativeSrcT src) { + return Eigen::half_impl::raw_uint16_to_half( + absl::bit_cast(GetRawValue(src))); + }; + return ConvertBetweenNativeTypesWithConverter( + src_literal, converter); +} + // This template specialization is here to make the compiler happy. bit_cast has // a static check that the types are the same size. This specialization should // never be used because the source and destination types are checked for @@ -1792,7 +1795,7 @@ void CopyToRepeatedField(RepeatedFieldT* dest, } // namespace void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const { - *proto->mutable_shape() = subshape(); + *proto->mutable_shape() = subshape().ToProto(); switch (subshape().element_type()) { case PRED: CopyToRepeatedField(proto->mutable_preds(), data()); @@ -1898,8 +1901,9 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) { // These conditions should have been checked in // MutableLiteralBase::CreateFromProto. TF_RET_CHECK(proto.has_shape()); - TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape())); - TF_RET_CHECK(ShapeUtil::Equal(proto.shape(), subshape())); + Shape shape(proto.shape()); + TF_RET_CHECK(LayoutUtil::HasLayout(shape)); + TF_RET_CHECK(ShapeUtil::Equal(shape, subshape())); if (LayoutUtil::IsSparseArray(subshape())) { // Compute the number of elements (indices) in the sparse shape and reserve diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h index e791048b4d9..fa9a71af4ce 100644 --- a/tensorflow/compiler/xla/literal.h +++ b/tensorflow/compiler/xla/literal.h @@ -301,7 +301,7 @@ class LiteralBase { // // Note: It's an antipattern to use this method then immediately call // MutableLiteralBase::Populate on the result (since that results in zero - // initialization, then reinitialization. Conside if a call to + // initialization, then reinitialization. Consider if a call to // absl::make_unique(shape), followed by the call to // MutableLiteralBase::Populate can be used instead. 
static Literal CreateFromShape(const Shape& shape); diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc index 8cec37897a9..49363ad802d 100644 --- a/tensorflow/compiler/xla/literal_test.cc +++ b/tensorflow/compiler/xla/literal_test.cc @@ -150,12 +150,58 @@ TEST_F(LiteralUtilTest, R3ToString) { const auto literal = LiteralUtil::CreateR3({{{1}, {2}}, {{3}, {4}}, {{5}, {6}}}); const string expected = R"(s32[3,2,1] { -{ { 1 }, - { 2 } }, -{ { 3 }, - { 4 } }, -{ { 5 }, - { 6 } } +{ + {1}, + {2} +}, +{ + {3}, + {4} +}, +{ + {5}, + {6} +} +})"; + EXPECT_EQ(expected, literal.ToString()); +} + +TEST_F(LiteralUtilTest, R6ToString) { + const auto literal = + LiteralUtil::CreateFromDimensions(S32, {2, 2, 1, 1, 1, 2}); + const string expected = R"(s32[2,2,1,1,1,2] { +{ /*i0=0*/ +{ /*i1=0*/ +{ /*i2=0*/ +{ /*i3=0*/ + { 0, 0 } +} +} +}, +{ /*i1=1*/ +{ /*i2=0*/ +{ /*i3=0*/ + { 0, 0 } +} +} +} +}, +{ /*i0=1*/ +{ /*i1=0*/ +{ /*i2=0*/ +{ /*i3=0*/ + { 0, 0 } +} +} +}, +{ /*i1=1*/ +{ /*i2=0*/ +{ /*i3=0*/ + { 0, 0 } +} +} +} +} })"; EXPECT_EQ(expected, literal.ToString()); } @@ -190,12 +236,16 @@ TEST_F(LiteralUtilTest, CreateR3FromArray3d) { EXPECT_THAT(literal.shape().dimensions(), ElementsAre(2, 3, 2)); string result = literal.ToString(); const string expected = R"(f32[2,3,2] { -{ { 1, 2 }, +{ + { 1, 2 }, { 3, 4 }, - { 5, 6 } }, -{ { 7, 8 }, + { 5, 6 } +}, +{ + { 7, 8 }, { 9, 10 }, - { 11, 12 } } + { 11, 12 } +} })"; EXPECT_EQ(expected, result); } @@ -247,18 +297,18 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) { EXPECT_THAT(literal.shape().dimensions(), ElementsAre(1, 2, 3, 2)); string result = literal.ToString(); const string expected = R"(f32[1,2,3,2] { - { /*i0=0*/ - { /*i1=0*/ - {1, 2}, - {1001, 1002}, - {2001, 2002} - }, - { /*i1=1*/ - {1, 2}, - {1001, 1002}, - {2001, 2002} - } - } +{ /*i0=0*/ +{ /*i1=0*/ + { 1, 2 }, + { 1001, 1002 }, + { 2001, 2002 } +}, +{ /*i1=1*/ + { 1, 2 }, + { 1001, 1002 }, + { 2001, 2002 } +} +} })"; EXPECT_EQ(expected, result); } @@ -268,30 +318,30 @@ TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) { ElementsAre(2, 2, 3, 3)); string result = literal_r4_2x2x3x3_dim0major_.ToString(); const string expected = R"(f32[2,2,3,3] { - { /*i0=0*/ - { /*i1=0*/ - {1, 2, 3}, - {4, 5, 6}, - {7, 8, 9} - }, - { /*i1=1*/ - {11, 12, 13}, - {14, 15, 16}, - {17, 18, 19} - } - }, - { /*i0=1*/ - { /*i1=0*/ - {101, 102, 103}, - {104, 105, 106}, - {107, 108, 109} - }, - { /*i1=1*/ - {201, 202, 203}, - {204, 205, 206}, - {207, 208, 209} - } - } +{ /*i0=0*/ +{ /*i1=0*/ + { 1, 2, 3 }, + { 4, 5, 6 }, + { 7, 8, 9 } +}, +{ /*i1=1*/ + { 11, 12, 13 }, + { 14, 15, 16 }, + { 17, 18, 19 } +} +}, +{ /*i0=1*/ +{ /*i1=0*/ + { 101, 102, 103 }, + { 104, 105, 106 }, + { 107, 108, 109 } +}, +{ /*i1=1*/ + { 201, 202, 203 }, + { 204, 205, 206 }, + { 207, 208, 209 } +} +} })"; EXPECT_EQ(expected, result); } @@ -1327,13 +1377,26 @@ TEST_F(LiteralUtilTest, BitcastConvertBetweenInvalidTypes) { absl::StrContains(status.error_message(), "bit widths are different")); } +// Sets the layout of the given ShapeProto to the default. 
+void SetDefaultLayoutOnProto(ShapeProto* shape_proto) { + CHECK(ShapeUtil::IsArrayPrimitiveType(shape_proto->element_type())); + shape_proto->mutable_layout()->set_format(DENSE); + auto* minor_to_major = + shape_proto->mutable_layout()->mutable_minor_to_major(); + minor_to_major->Resize(shape_proto->dimensions_size(), 0); + const int64 size = minor_to_major->size(); + for (int64 i = 0; i < size; ++i) { + minor_to_major->Set(i, size - 1 - i); + } +} + TEST_F(LiteralUtilTest, CopyFromProto_Bool) { LiteralProto p; p.mutable_shape()->set_element_type(PRED); for (int len = 0; len < 25; ++len) { p.mutable_shape()->clear_dimensions(); p.mutable_shape()->add_dimensions(len); - LayoutUtil::SetToDefaultLayout(p.mutable_shape()); + SetDefaultLayoutOnProto(p.mutable_shape()); p.clear_preds(); for (int i = 0; i < len; ++i) { p.add_preds((i % 2) == (len % 2)); @@ -1359,7 +1422,7 @@ TEST_F(LiteralUtilTest, ToProto_f16) { EXPECT_EQ(4, m.data().size()); LiteralProto p = m.ToProto(); - EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape())); + EXPECT_EQ(4, ShapeUtil::ElementsIn(Shape(p.shape()))); EXPECT_EQ(8, p.f16s().size()); const char* d = p.f16s().data(); EXPECT_EQ(d[0], 0); @@ -1382,7 +1445,7 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) { p.mutable_shape()->set_element_type(F16); p.mutable_shape()->clear_dimensions(); p.mutable_shape()->add_dimensions(4); - LayoutUtil::SetToDefaultLayout(p.mutable_shape()); + SetDefaultLayoutOnProto(p.mutable_shape()); p.clear_f16s(); p.set_f16s(half_vals, 8); TF_ASSERT_OK_AND_ASSIGN(Literal literal, Literal::CreateFromProto(p)); @@ -1404,7 +1467,7 @@ TEST_F(LiteralUtilTest, CopyFromProto_u16) { p.mutable_shape()->set_element_type(U16); p.mutable_shape()->clear_dimensions(); p.mutable_shape()->add_dimensions(4); - LayoutUtil::SetToDefaultLayout(p.mutable_shape()); + SetDefaultLayoutOnProto(p.mutable_shape()); p.clear_u16s(); p.set_u16s(uint16_vals, 8); TF_ASSERT_OK_AND_ASSIGN(Literal literal, Literal::CreateFromProto(p)); @@ -1537,9 +1600,9 @@ TEST_F(LiteralUtilTest, DecomposeTuple) { Literal nested_tuple = LiteralUtil::MakeTuple( {&tuple_elements[0], &tuple_elements[1], &nil_literal}); - EXPECT_FALSE(ShapeUtil::IsNil(nested_tuple.shape())); + EXPECT_FALSE(ShapeUtil::IsEmptyTuple(nested_tuple.shape())); std::vector elements = nested_tuple.DecomposeTuple(); - EXPECT_TRUE(ShapeUtil::IsNil(nested_tuple.shape())); + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(nested_tuple.shape())); ASSERT_EQ(elements.size(), 3); @@ -1590,7 +1653,7 @@ TEST_F(LiteralUtilTest, MoveIntoTuple) { EXPECT_EQ(literal.Get({1}, /*shape_index=*/{2, 1}), 44.0); for (const Literal& element : elements) { - EXPECT_TRUE(ShapeUtil::IsNil(element.shape())); + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(element.shape())); } } @@ -1706,7 +1769,7 @@ TEST_F(LiteralUtilTest, ProtoRoundTrip) { TEST_F(LiteralUtilTest, InvalidProtoNoValues) { // Proto contains a shape, but no values. LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}); + *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}).ToProto(); Status status = Literal::CreateFromProto(proto).status(); ASSERT_FALSE(status.ok()); EXPECT_THAT(status.error_message(), @@ -1727,7 +1790,7 @@ TEST_F(LiteralUtilTest, InvalidProtoNoShape) { TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) { // Proto contains values in wrong container. 
LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}); + *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}).ToProto(); proto.add_preds(false); proto.add_preds(true); proto.add_preds(false); @@ -1740,7 +1803,7 @@ TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) { TEST_F(LiteralUtilTest, InvalidProtoTooFewValues) { // Proto contains too few values. LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {42, 2}); + *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {42, 2}).ToProto(); proto.add_f32s(1.0); proto.add_f32s(2.0); proto.add_f32s(3.0); @@ -1753,7 +1816,7 @@ TEST_F(LiteralUtilTest, InvalidProtoTooFewValues) { TEST_F(LiteralUtilTest, InvalidProtoTooManyValues) { // Proto contains too many values. LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeShape(S32, {2}); + *proto.mutable_shape() = ShapeUtil::MakeShape(S32, {2}).ToProto(); proto.add_s32s(42); proto.add_s32s(-10); proto.add_s32s(100); @@ -1766,8 +1829,8 @@ TEST_F(LiteralUtilTest, InvalidProtoTooManyValues) { TEST_F(LiteralUtilTest, InvalidProtoMissingLayout) { // Proto shape missing layout. LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeShape(PRED, {2, 2}); - LayoutUtil::ClearLayout(proto.mutable_shape()); + *proto.mutable_shape() = ShapeUtil::MakeShape(PRED, {2, 2}).ToProto(); + proto.mutable_shape()->clear_layout(); proto.add_preds(true); proto.add_preds(false); proto.add_preds(true); @@ -1780,11 +1843,13 @@ TEST_F(LiteralUtilTest, InvalidProtoMissingLayout) { TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) { // Proto has the too few tuple elements. LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeTupleShape( - {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})}); + *proto.mutable_shape() = + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})}) + .ToProto(); LiteralProto* element0 = proto.add_tuple_literals(); *element0->mutable_shape() = - ShapeUtil::GetTupleElementShape(proto.shape(), 0); + ShapeUtil::GetTupleElementShape(Shape(proto.shape()), 0).ToProto(); element0->add_preds(false); element0->add_preds(true); @@ -1796,19 +1861,21 @@ TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) { TEST_F(LiteralUtilTest, InvalidProtoTooManyTupleElements) { // Proto has the too many tuple elements. 
LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeTupleShape( - {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})}); + *proto.mutable_shape() = + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})}) + .ToProto(); LiteralProto* element0 = proto.add_tuple_literals(); *element0->mutable_shape() = - ShapeUtil::GetTupleElementShape(proto.shape(), 0); + ShapeUtil::GetTupleElementShape(Shape(proto.shape()), 0).ToProto(); element0->add_preds(false); element0->add_preds(true); LiteralProto* element1 = proto.add_tuple_literals(); *element1->mutable_shape() = - ShapeUtil::GetTupleElementShape(proto.shape(), 1); + ShapeUtil::GetTupleElementShape(Shape(proto.shape()), 1).ToProto(); element1->add_f32s(42.0); LiteralProto* element2 = proto.add_tuple_literals(); - *element2->mutable_shape() = ShapeUtil::MakeShape(F32, {}); + *element2->mutable_shape() = ShapeUtil::MakeShape(F32, {}).ToProto(); element2->add_f32s(123.0); Status status = Literal::CreateFromProto(proto).status(); diff --git a/tensorflow/compiler/xla/parse_flags_from_env.cc b/tensorflow/compiler/xla/parse_flags_from_env.cc index 40481331b69..5b568888d14 100644 --- a/tensorflow/compiler/xla/parse_flags_from_env.cc +++ b/tensorflow/compiler/xla/parse_flags_from_env.cc @@ -13,15 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// This module exports ParseFlagsFromEnv(), which allows other modules to parse -// flags from an environtment variable, or a file named by the environment -// variable. +// This module exports ParseFlagsFromEnvAndDieIfUnknown(), which allows other +// modules to parse flags from an environtment variable, or a file named by the +// environment variable. #include #include #include +#include +#include #include +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/parse_flags_from_env.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/logging.h" @@ -32,7 +37,6 @@ limitations under the License. namespace xla { -static const char kEnvVar[] = "TF_XLA_FLAGS"; // environment variable queried static const char kWS[] = " \t\r\n"; // whitespace // The following struct represents an argv[]-style array, parsed @@ -42,12 +46,20 @@ static const char kWS[] = " \t\r\n"; // whitespace // constructor/destructor collisions with other "private" types // in the same named namespace. namespace { + +// Functor which deletes objects by calling `free`. Necessary to free strdup'ed +// strings created by AppendToEnvArgv. +struct FreeDeleter { + void operator()(char* ptr) { free(ptr); } +}; + struct EnvArgv { EnvArgv() : initialized(false), argc(0) {} bool initialized; // whether the other fields have been set. int argc; // elements used in argv[] std::vector argv; // flag arguments parsed from environment string. 
- std::vector argv_save; // saved values from argv[] to avoid leaks + // saved values from argv[] to avoid leaks + std::vector> argv_save; }; } // anonymous namespace @@ -63,7 +75,7 @@ static void AppendToEnvArgv(const char* s0, size_t s0len, const char* s1, string s = string(s0, s0len) + string(s1, s1len); char* str = strdup(s.c_str()); a->argv.push_back(str); - a->argv_save.push_back(str); + a->argv_save.emplace_back(str); a->argc++; } } @@ -127,14 +139,14 @@ static void ParseArgvFromString(const string& flag_str, EnvArgv* a) { } } -// Call ParseArgvFromString(..., a) on a string derived from the setting of an -// environment variable kEnvVar, or a file it points to. -static void SetArgvFromEnv(EnvArgv* a) { +// Call ParseArgvFromString(..., a) on a string derived from the setting of the +// environment variable `envvar`, or a file it points to. +static void SetArgvFromEnv(absl::string_view envvar, EnvArgv* a) { if (!a->initialized) { static const char kDummyArgv[] = ""; AppendToEnvArgv(kDummyArgv, strlen(kDummyArgv), nullptr, 0, a); // dummy argv[0] - const char* env = getenv(kEnvVar); + const char* env = getenv(string(envvar).c_str()); if (env == nullptr || env[0] == '\0') { // nothing } else if (env[strspn(env, kWS)] == '-') { // flags in env var value @@ -157,48 +169,66 @@ static void SetArgvFromEnv(EnvArgv* a) { } } -// The simulated argv[] parsed from the environment. -static EnvArgv* env_argv; +// The simulated argv[] parsed from the environment, one for each different +// environment variable we've seen. +static std::unordered_map& EnvArgvs() { + static auto* env_argvs = new std::unordered_map(); + return *env_argvs; +} -// Used to protect accesses to env_argv. +// Used to protect accesses to env_argvs. static tensorflow::mutex env_argv_mu(tensorflow::LINKER_INITIALIZED); -// Call Flags::Parse(argc, argv, flag_list) against any as yet unrecognized -// flags passed in from the environment. -bool ParseFlagsFromEnv(const std::vector& flag_list) { - env_argv_mu.lock(); - if (env_argv == nullptr) { - env_argv = new EnvArgv; - } - SetArgvFromEnv(env_argv); // a no-op if already initialized +bool ParseFlagsFromEnvAndDieIfUnknown( + absl::string_view envvar, const std::vector& flag_list) { + tensorflow::mutex_lock lock(env_argv_mu); + auto* env_argv = &EnvArgvs()[string(envvar)]; + SetArgvFromEnv(envvar, env_argv); // a no-op if already initialized bool result = tensorflow::Flags::Parse(&env_argv->argc, &env_argv->argv[0], flag_list); - env_argv_mu.unlock(); + + // There's always at least one unparsed argc, namely the fake argv[0]. + if (result && env_argv->argc != 1) { + // Skip the first argv, which is the fake argv[0]. + auto unknown_flags = absl::MakeSpan(env_argv->argv); + unknown_flags.remove_prefix(1); + + // Some flags are set on XLA_FLAGS, others on TF_XLA_FLAGS. If we find an + // unrecognized flag, suggest the alternative. + string alternate_envvar; + if (envvar == "TF_XLA_FLAGS") { + alternate_envvar = "XLA_FLAGS"; + } else if (envvar == "XLA_FLAGS") { + alternate_envvar = "TF_XLA_FLAGS"; + } + string did_you_mean; + if (!alternate_envvar.empty()) { + did_you_mean = absl::StrFormat( + "\nPerhaps you meant to specify these on the %s envvar?", + alternate_envvar); + } + + LOG(FATAL) << "Unknown flag" << (unknown_flags.size() > 1 ? "s" : "") + << " in " << envvar << ": " << absl::StrJoin(unknown_flags, " ") + << did_you_mean; + return false; + } return result; } // Testing only. 
-// Reset the env_argv struct so that subsequent calls to ParseFlagsFromEnv() -// will parse the environment variable (or the file it points to) anew, and set -// *pargc, and *pargv to point to the internal locations of the argc and argv -// constructed from the environment. -void ResetFlagsFromEnvForTesting(int** pargc, std::vector** pargv) { - env_argv_mu.lock(); - if (env_argv == nullptr) { - env_argv = new EnvArgv; - } - if (!env_argv->argv_save.empty()) { - for (int i = 0; env_argv->argv_save[i] != nullptr; i++) { - free(env_argv->argv_save[i]); - } - } - env_argv->initialized = false; - env_argv->argc = 0; - env_argv->argv.clear(); - env_argv->argv_save.clear(); - env_argv_mu.unlock(); - *pargc = &env_argv->argc; - *pargv = &env_argv->argv; +// +// Resets the env_argv struct so that subsequent calls to +// ParseFlagsFromEnvAndDieIfUnknown() will parse the environment variable (or +// the file it points to) anew, and set *pargc, and *pargv to point to the +// internal locations of the argc and argv constructed from the environment. +void ResetFlagsFromEnvForTesting(absl::string_view envvar, int** pargc, + std::vector** pargv) { + tensorflow::mutex_lock lock(env_argv_mu); + EnvArgvs().erase(string(envvar)); + auto& env_argv = EnvArgvs()[string(envvar)]; + *pargc = &env_argv.argc; + *pargv = &env_argv.argv; } } // namespace xla diff --git a/tensorflow/compiler/xla/parse_flags_from_env.h b/tensorflow/compiler/xla/parse_flags_from_env.h index fe86ee687f8..76940a4299a 100644 --- a/tensorflow/compiler/xla/parse_flags_from_env.h +++ b/tensorflow/compiler/xla/parse_flags_from_env.h @@ -16,48 +16,58 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_ #define TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_ -// This module exports ParseFlagsFromEnv(), which allows other modules to parse -// flags from the environtment variable TF_XLA_FLAGS, or (if the first +// This module exports ParseFlagsFromEnvAndDieIfUnknown(), which allows other +// modules to parse flags from an environtment variable, or (if the first // non-whitespace in the variable value is not '-'), a file named by that -// environment variable. The accepted syntax is that flags arguments are of -// the form --flag=value or (for boolean flags) --flag, and are whitespace -// separated. The may be one of: -// - -// in which case the effective value is the string itself -// - in which case the effective value is the -// string with the single-quotes removed -// - in which case the effective value if the -// string with the double-quotes removed, and escaped sequences of -// replaced by . +// environment variable. +// +// The accepted syntax is that flags arguments are of the form --flag=value or +// (for boolean flags) --flag, and are whitespace separated. The may be +// one of: +// +// - +// in which case the effective value is the string itself +// - in which case the effective value is the +// string with the single-quotes removed +// - in which case the effective value if the +// string with the double-quotes removed, and escaped sequences of +// replaced by . // // Flags values inconsistent with the type of the flag will be rejected by the // flag parser. // // Examples: -// TF_XLA_FLAGS="--foo=bar --wombat='value with a space'" // -// TF_XLA_FLAGS=/tmp/flagfile +// - TF_XLA_FLAGS="--foo=bar --wombat='value with a space'" +// - TF_XLA_FLAGS=/tmp/flagfile +// // where /tmp/flagfile might contain -// --some_flag="This is a string containing a \" and a '." 
-// --another_flag=wombats +// +// --some_flag="This is a string containing a \" and a '." +// --another_flag=wombats #include +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/command_line_flags.h" namespace xla { -// Call tensorflow::Flags::Parse(argc, argv, flag_list) against any as yet -// unrecognized flags passed in from the environment, and return its -// return value. -bool ParseFlagsFromEnv(const std::vector& flag_list); +// Calls tensorflow::Flags::Parse(argc, argv, flag_list) against any as yet +// unrecognized flags passed in the environment variable `envvar`, and returns +// its return value. +// +// Raises a fatal error if any flags in `envvar` were not recognized. +bool ParseFlagsFromEnvAndDieIfUnknown( + absl::string_view envvar, const std::vector& flag_list); // Used only for testing. Not to be used by clients. -void ResetFlagsFromEnvForTesting(int** pargc, std::vector** pargv); +void ResetFlagsFromEnvForTesting(absl::string_view envvar, int** pargc, + std::vector** pargv); } // namespace xla diff --git a/tensorflow/compiler/xla/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/parse_flags_from_env_test.cc index edd6538402d..3465552ebbf 100644 --- a/tensorflow/compiler/xla/parse_flags_from_env_test.cc +++ b/tensorflow/compiler/xla/parse_flags_from_env_test.cc @@ -37,20 +37,7 @@ static void TestParseFlagsFromEnv(const char* msg) { // Initialize module under test. int* pargc; std::vector* pargv; - ResetFlagsFromEnvForTesting(&pargc, &pargv); - - // Ensure that environment variable can be parsed when - // no flags are expected. - std::vector empty_flag_list; - bool parsed_ok = ParseFlagsFromEnv(empty_flag_list); - CHECK(parsed_ok) << msg; - const std::vector& argv_first = *pargv; - CHECK_NE(argv_first[0], nullptr) << msg; - int i = 0; - while (argv_first[i] != nullptr) { - i++; - } - CHECK_EQ(i, *pargc) << msg; + ResetFlagsFromEnvForTesting("TF_XLA_FLAGS", &pargc, &pargv); // Check that actual flags can be parsed. 
bool simple = false; @@ -65,7 +52,7 @@ static void TestParseFlagsFromEnv(const char* msg) { tensorflow::Flag("single_quoted", &single_quoted, ""), tensorflow::Flag("double_quoted", &double_quoted, ""), }; - parsed_ok = ParseFlagsFromEnv(flag_list); + bool parsed_ok = ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", flag_list); CHECK_EQ(*pargc, 1) << msg; const std::vector& argv_second = *pargv; CHECK_NE(argv_second[0], nullptr) << msg; @@ -171,7 +158,8 @@ int main(int argc, char* argv[]) { tensorflow::Flag("int_flag", &int_flag, "An integer flag to test with"), }; xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); - bool parse_ok = xla::ParseFlagsFromEnv(flag_list); + bool parse_ok = + xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", flag_list); if (!parse_ok) { LOG(QFATAL) << "can't parse from environment\n" << usage; } diff --git a/tensorflow/compiler/xla/protobuf_util.cc b/tensorflow/compiler/xla/protobuf_util.cc index b507a2ef79f..ac342bf40fb 100644 --- a/tensorflow/compiler/xla/protobuf_util.cc +++ b/tensorflow/compiler/xla/protobuf_util.cc @@ -40,16 +40,6 @@ bool ProtobufEquals(const tensorflow::protobuf::Message& m1, namespace { -string SanitizeFilename(const string& file_name) { - string safe_file_name = file_name; - for (char& c : safe_file_name) { - if (c == '/' || c == '\\') { - c = '_'; - } - } - return safe_file_name; -} - std::pair>*> GetDirectoryExpanders() { static auto* mutex = new tensorflow::mutex; diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index 4d2a37cfac3..6e2ee866321 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -148,14 +148,19 @@ static StatusOr ToBuffer(LocalClient* client, /* static */ StatusOr LocalShapedBuffer::FromLiteral( - const Literal& argument, const absl::optional& shape_with_layout) { + const Literal& argument, const absl::optional& shape_with_layout, + int replica_number) { LocalClient* client = GetOrCreateLocalClient(); + TF_ASSIGN_OR_RETURN(int device_ordinal, + client->ReplicaNumberToDeviceOrdinal(replica_number)); + VLOG(1) << "Creating shaped buffer from literal on replica/ordinal: " + << replica_number << "/" << device_ordinal; StatusOr buf = [&] { if (shape_with_layout) { Literal relaid = argument.Relayout(shape_with_layout.value()); - return ToBuffer(client, /*device_ordinal=*/0, relaid); + return ToBuffer(client, device_ordinal, relaid); } - return ToBuffer(client, /*device_ordinal=*/0, argument); + return ToBuffer(client, device_ordinal, argument); }(); TF_RETURN_IF_ERROR(buf.status()); return new LocalShapedBuffer(std::move(buf).ValueOrDie()); @@ -312,67 +317,127 @@ CompiledLocalComputation::CompiledLocalComputation( StatusOr CompiledLocalComputation::Execute( absl::Span argument_handles) { LocalClient* client = GetOrCreateLocalClient(); + StatusOr device_ordinal_status = client->ReplicaNumberToDeviceOrdinal(0); + StatusOr result_buffer_status; + if (!device_ordinal_status.ok()) { + result_buffer_status = device_ordinal_status.status(); + } else { + const int device_ordinal = device_ordinal_status.ValueOrDie(); + VLOG(3) << "Replica 0 mapped to device ordinal for execution: " + << device_ordinal; - VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas."; + std::vector argument_buffers; + argument_buffers.reserve(argument_handles.size()); + for (auto& handle : argument_handles) { + 
argument_buffers.push_back(handle->shaped_buffer()); + } + + DeviceAssignment device_assignment = + client->backend() + .computation_placer() + ->AssignDevices(1, /*computation_count=*/1) + .ConsumeValueOrDie(); + + ExecutableRunOptions options; + options.set_device_ordinal(device_ordinal); + options.set_allocator(client->backend().memory_allocator()); + options.set_intra_op_thread_pool( + client->backend().eigen_intra_op_thread_pool_device()); + options.set_device_assignment(&device_assignment); + + result_buffer_status = executable_->Run(argument_buffers, options); + } + + if (!result_buffer_status.ok()) { + return InternalError( + "Failed running replica 0 (other replicas may have failed as well): " + "%s.", + result_buffer_status.status().ToString()); + } + return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie()); +} + +StatusOr CompiledLocalComputation::ExecutePerReplica( + absl::Span> argument_handles) { + LocalClient* client = GetOrCreateLocalClient(); + const int num_replicas = GetReplicaCount(); + + if (argument_handles.size() != num_replicas) { + return InvalidArgument( + "Attempted to execute with %d replicas when replica count is %d", + argument_handles.size(), num_replicas); + } + + VLOG(1) << "Executing with " << num_replicas << " replicas."; // Each replica populates a StatusOr result, but only the output value of // replica zero is returned. - std::vector> results(GetReplicaCount()); - { - tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "xlarun", - GetReplicaCount()); - - for (int replica = 0; replica < GetReplicaCount(); ++replica) { - pool.Schedule( - [this, client, replica, &argument_handles, &results] { - StatusOr device_ordinal_status = - client->ReplicaNumberToDeviceOrdinal(replica); - if (!device_ordinal_status.ok()) { - results[replica] = device_ordinal_status.status(); - return; - } - const int device_ordinal = device_ordinal_status.ValueOrDie(); - VLOG(3) << "Replica " << replica - << " mapped to device ordinal for execution: " - << device_ordinal; - - std::vector argument_buffers; - argument_buffers.reserve(argument_handles.size()); - for (auto& handle : argument_handles) { - argument_buffers.push_back(handle->shaped_buffer()); - } - - DeviceAssignment device_assignment = - client->backend() - .computation_placer() - ->AssignDevices(GetReplicaCount(), /*computation_count=*/1) - .ConsumeValueOrDie(); - - ExecutableRunOptions options; - options.set_device_ordinal(device_ordinal); - options.set_allocator(client->backend().memory_allocator()); - options.set_intra_op_thread_pool( - client->backend().eigen_intra_op_thread_pool_device()); - options.set_device_assignment(&device_assignment); - StatusOr result_buffer_status = - executable_->Run(argument_buffers, options); - - results[replica] = std::move(result_buffer_status); - }); + std::vector> results(num_replicas); + auto execute = [this, client, num_replicas, &argument_handles, + &results](int replica) { + StatusOr device_ordinal_status = + client->ReplicaNumberToDeviceOrdinal(replica); + if (!device_ordinal_status.ok()) { + results[replica] = device_ordinal_status.status(); + return; } + const int device_ordinal = device_ordinal_status.ValueOrDie(); + VLOG(3) << "Replica " << replica + << " mapped to device ordinal for execution: " << device_ordinal; + + std::vector argument_buffers; + argument_buffers.reserve(argument_handles[replica].size()); + for (auto& handle : argument_handles[replica]) { + argument_buffers.push_back(handle->shaped_buffer()); + } + + DeviceAssignment 
device_assignment = + client->backend() + .computation_placer() + ->AssignDevices(num_replicas, /*computation_count=*/1) + .ConsumeValueOrDie(); + + ExecutableRunOptions options; + options.set_device_ordinal(device_ordinal); + options.set_allocator(client->backend().memory_allocator()); + options.set_intra_op_thread_pool( + client->backend().eigen_intra_op_thread_pool_device()); + options.set_device_assignment(&device_assignment); + StatusOr result_buffer_status = + executable_->Run(argument_buffers, options); + + results[replica] = std::move(result_buffer_status); + }; + + if (num_replicas == 1) { + // Fast-path if there is only one replica ā€” run the computation on the + // current thread. + execute(0); + } else { + // TODO(phawkins): don't recreate the threadpool for each execution. + tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "xlarun", + num_replicas - 1); + + for (int replica = 0; replica < num_replicas - 1; ++replica) { + pool.Schedule([&execute, replica] { execute(replica); }); + } + execute(num_replicas - 1); } - for (int replica = 0; replica < GetReplicaCount(); ++replica) { - const auto& statusor = results[replica]; + std::vector wrapped_results(num_replicas); + for (int replica = 0; replica < num_replicas; ++replica) { + auto& statusor = results[replica]; if (!statusor.ok()) { return InternalError( "Failed running replica %d (other replicas may have failed as well): " "%s.", replica, statusor.status().ToString()); } + wrapped_results[replica] = + new LocalShapedBuffer(std::move(statusor).ValueOrDie()); } - return new LocalShapedBuffer(std::move(results[0]).ValueOrDie()); + return new LocalShapedBufferTuple(std::move(wrapped_results)); } static StatusOr GetReturnValueShape(const XlaComputation& computation) { @@ -487,12 +552,13 @@ StatusOr LocalComputation::CompileForXrt( xrt::XLAComputation c; auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); + ProgramShape shapes; for (auto& shape : argument_shapes) { - *shapes->add_parameters() = shape; + *shapes.add_parameters() = shape; } - TF_ASSIGN_OR_RETURN(*shapes->mutable_result(), GetReturnValueShape()); - LayoutUtil::SetToDefaultLayout(shapes); + TF_ASSIGN_OR_RETURN(*shapes.mutable_result(), GetReturnValueShape()); + LayoutUtil::SetToDefaultLayout(&shapes); + *config->mutable_program_shape() = shapes.ToProto(); auto snapshot = computation().Snapshot().ValueOrDie(); *c.mutable_hlo_snapshot() = *snapshot; @@ -584,9 +650,9 @@ LocalOp LocalComputationBuilder::Broadcast( } LocalOp LocalComputationBuilder::BroadcastInDim( - const LocalOp& operand, const Shape& shape, + const LocalOp& operand, absl::Span out_dim_sizes, absl::Span broadcast_dimensions) { - return xla::BroadcastInDim(operand.op(), shape, broadcast_dimensions); + return xla::BroadcastInDim(operand.op(), out_dim_sizes, broadcast_dimensions); } LocalOp LocalComputationBuilder::Pad(const LocalOp& operand, diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h index 9e617c48bdc..149e44570df 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.h +++ b/tensorflow/compiler/xla/python/local_computation_builder.h @@ -71,7 +71,8 @@ StatusOr TransferFromOutfeedLocalReplica(const Shape& shape, class LocalShapedBuffer { public: static StatusOr FromLiteral( - const Literal& argument, const absl::optional& shape_with_layout); + const Literal& argument, const absl::optional& shape_with_layout, + int replica_number); 
LocalShapedBuffer(ScopedShapedBuffer shaped_buffer); StatusOr ToLiteral() const; @@ -175,6 +176,12 @@ class CompiledLocalComputation { StatusOr Execute( absl::Span argument_handles); + // Execute on many replicas. Takes a sequence of argument lists (one argument + // list per replica) and returns a tuple of results (one result per replica). + // The number of argument lists must be equal to the replica count. + StatusOr ExecutePerReplica( + absl::Span > argument_handles); + private: std::unique_ptr executable_; }; @@ -282,7 +289,8 @@ class LocalComputationBuilder { LocalOp Broadcast(const LocalOp& operand, absl::Span broadcast_sizes); - LocalOp BroadcastInDim(const LocalOp& operand, const Shape& shape, + LocalOp BroadcastInDim(const LocalOp& operand, + absl::Span out_dim_sizes, absl::Span broadcast_dimensions); LocalOp Pad(const LocalOp& operand, const LocalOp& padding_value, diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i index feabfdb889c..d23d693c1e5 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.i +++ b/tensorflow/compiler/xla/python/local_computation_builder.i @@ -363,6 +363,37 @@ tensorflow::ImportNumpy(); $1 = temps; } +%typemap(in) absl::Span > + (std::vector > temps) { + if (!PySequence_Check($input)) { + PyErr_SetString(PyExc_TypeError, "Argument is not a sequence"); + SWIG_fail; + } + const int size = PySequence_Size($input); + temps.reserve(size); + for (int i = 0; i < size; ++i) { + PyObject* o = PySequence_GetItem($input, i); + std::vector vec; + const int vec_size = PySequence_Size(o); + vec.reserve(vec_size); + for (int j = 0; j < vec_size; ++j) { + PyObject* vec_elt = PySequence_GetItem(o, j); + LocalShapedBuffer* lsbp; + if ((SWIG_ConvertPtr(vec_elt, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*), + SWIG_POINTER_EXCEPTION)) == -1) { + Py_DECREF(vec_elt); + Py_DECREF(o); + SWIG_fail; + } + vec.push_back(lsbp); + Py_DECREF(vec_elt); + } + temps.push_back(vec); + Py_DECREF(o); + } + $1 = temps; +} + %typemap(in) absl::Span (std::vector temps) { if (!PySequence_Check($input)) { @@ -921,22 +952,22 @@ tensorflow::ImportNumpy(); $1 = NULL; } else { if (!HandleStringAttribute($input, "generate_hlo_graph", [&](string s) { - build_options.set_generate_hlo_graph(std::move(s)); + build_options.mutable_debug_options()->set_xla_generate_hlo_graph(std::move(s)); })) { return nullptr; } if (!HandleStringAttribute($input, "dump_optimized_hlo_proto_to", [&](string s) { - build_options.set_dump_optimized_hlo_proto_to(std::move(s)); + build_options.mutable_debug_options()->set_xla_dump_optimized_hlo_proto_to(std::move(s)); })) { return nullptr; } if (!HandleStringAttribute($input, "dump_unoptimized_hlo_proto_to", [&](string s) { - build_options.set_dump_unoptimized_hlo_proto_to(std::move(s)); + build_options.mutable_debug_options()->set_xla_dump_unoptimized_hlo_proto_to(std::move(s)); })) { return nullptr; } if (!HandleStringAttribute($input, "dump_per_pass_hlo_proto_to", [&](string s) { - build_options.set_dump_per_pass_hlo_proto_to(std::move(s)); + build_options.mutable_debug_options()->set_xla_dump_per_pass_hlo_proto_to(std::move(s)); })) { return nullptr; } @@ -950,7 +981,7 @@ tensorflow::ImportNumpy(); PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.hlo_profile must be a bool or None."); SWIG_fail; } - build_options.set_hlo_profile(o == Py_True); + build_options.mutable_debug_options()->set_xla_hlo_profile(o == Py_True); } Py_DECREF(o); @@ -992,11 +1023,13 @@ 
tensorflow::ImportNumpy(); %unignore xla::swig::XrtAllocation; %unignore xla::swig::XrtAllocation::FromLiteral; %unignore xla::swig::XrtAllocation::ToLiteral; +%unignore xla::swig::XrtAllocation::shape; %unignore xla::swig::XrtAllocationTuple; %unignore xla::swig::XrtAllocationTuple::Release; %unignore xla::swig::XrtAllocationTuple::size; %unignore xla::swig::CompiledLocalComputation; %unignore xla::swig::CompiledLocalComputation::Execute; +%unignore xla::swig::CompiledLocalComputation::ExecutePerReplica; %unignore xla::swig::CompiledXrtComputation; %unignore xla::swig::CompiledXrtComputation::Execute; %unignore xla::swig::LocalComputation; diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 92b0685dbba..c91a2aaf56d 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -26,6 +26,9 @@ import os import numpy as np +import six +from six.moves import xrange + from tensorflow.compiler.xla import xla_data_pb2 from tensorflow.compiler.xla.python import pywrap_xla as c_api from tensorflow.compiler.xla.service import hlo_pb2 @@ -75,6 +78,13 @@ def CurrentSourceInfoMetadata(op_type=None, op_name=None, skip_frames=1): source_line=lineno) +def _maybe_encode_string(s): + if six.PY3: + return s.encode('utf-8') + else: + return s + + class PaddingType(enum.Enum): VALID = 1 SAME = 2 @@ -212,23 +222,33 @@ class LocalBuffer(object): means the referent is in device memory. """ - def __init__(self, c_buffer, backend): + def __init__(self, c_buffer, backend, replica): self.c_buffer = c_buffer self._backend = backend + self._replica = replica if backend.backend_type == BackendType.XRT: self._delete = c_api.DeleteXrtAllocation else: self._delete = c_api.DeleteLocalShapedBuffer @staticmethod - def from_pyval(pyval, backend=XLA_LOCAL_BACKEND): + def from_pyval(pyval, replica=0, backend=XLA_LOCAL_BACKEND): """Allocate and copy to XLA the given python value.""" pyval = require_numpy_array_layout(pyval) + num_replicas = get_replica_count() + if not 0 <= replica < num_replicas: + raise ValueError( + 'Attempt to place buffer on replica {} when the replica count is {}' + .format(replica, num_replicas)) if backend.backend_type == BackendType.XRT: - cbuf = c_api.XrtAllocation.FromLiteral(pyval, backend.target) + if replica != 0: + raise NotImplementedError( + 'Multi-replica execution is not yet supported via the XRT backend.') + cbuf = c_api.XrtAllocation.FromLiteral( + pyval, _maybe_encode_string(backend.target)) else: - cbuf = c_api.LocalShapedBuffer.FromLiteral(pyval, None) - return LocalBuffer(cbuf, backend) + cbuf = c_api.LocalShapedBuffer.FromLiteral(pyval, None, replica) + return LocalBuffer(cbuf, backend, replica) def to_py(self): return self.c_buffer.ToLiteral() @@ -236,6 +256,9 @@ class LocalBuffer(object): def shape(self): return _wrap_shape(self.c_buffer.shape()) + def replica(self): + return self._replica + def delete(self): if self.c_buffer is not None: self._delete(self.c_buffer) @@ -245,14 +268,15 @@ class LocalBuffer(object): """Assuming a tuple buffer, unpack it into constituent tuple elements.""" assert self.c_buffer is not None if self._backend.backend_type == BackendType.XRT: - result = c_api.DestructureXrtAllocationTuple(self.c_buffer, - self._backend.target) + result = c_api.DestructureXrtAllocationTuple( + self.c_buffer, _maybe_encode_string(self._backend.target)) else: result = c_api.DestructureLocalShapedBufferTuple(self.c_buffer) self.delete() size = result.size() destructured = 
tuple( - LocalBuffer(result.Release(i), backend=self._backend) + LocalBuffer( + result.Release(i), replica=self._replica, backend=self._backend) for i in xrange(size)) return destructured @@ -322,6 +346,9 @@ class Shape(object): def __ne__(self, other): return not self == other + def __hash__(self): + return hash((self._dtype, self._dimensions, self._minor_to_major)) + def __repr__(self): return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, ' '_is_tuple={!r}, _minor_to_major={!r})').format( @@ -541,10 +568,13 @@ class LocalComputation(object): ] result_shape = result_shape.map_leaves(layout_fn) + argument_shapes = list(argument_shapes) + compile_options = compile_options or CompileOptions() compile_options.result_shape = result_shape if self._backend.backend_type == BackendType.XRT: - c = self.computation.CompileForXrt(argument_shapes, self._backend.target) + c = self.computation.CompileForXrt( + argument_shapes, _maybe_encode_string(self._backend.target)) else: c = self.computation.Compile(argument_shapes, compile_options) return LocalComputation(c, is_compiled=True, backend=self._backend) @@ -558,23 +588,87 @@ class LocalComputation(object): compile_options=compile_options, layout_fn=layout_fn) - def Execute(self, arguments=()): - """Execute with LocalBuffer arguments and return value.""" + def GetReturnValueShape(self): + return _wrap_shape(self._c_computation.GetReturnValueShape()) + + def Execute(self, arguments=(), check_for_deleted_args=True): + """Execute on one replica with LocalBuffer arguments and return value.""" + if check_for_deleted_args and any(arg.is_deleted() for arg in arguments): + raise ValueError('Executing with deleted local buffer argument') + raw_args = [arg.c_buffer for arg in arguments] + output_buffer = self._c_computation.Execute(raw_args) + return LocalBuffer(output_buffer, backend=self._backend, replica=0) + + def ExecutePerReplica(self, arguments=None): + """Execute on many replicas with LocalBuffer arguments and return value. + + Args: + arguments: A sequence of sequences of LocalBuffers. The i'th inner + sequence comprises the arguments for execution on the i'th replica. + + Returns: + A list of the computation's outputs on each replica, as a LocalBuffer. If + a shallow sequence of arguments was passed in for `arguments`, then the + sole, zero'th replica's output is returned instead, as a LocalBuffer. 
+ """ if not self._is_compiled: raise ValueError('Cannot execute an uncompiled local XLA computation.') - arguments = tuple(arguments) - if any(arg.is_deleted() for arg in arguments): - raise ValueError('Executing with deleted local buffer argument') - return LocalBuffer( - self._c_computation.Execute([arg.c_buffer for arg in arguments]), - backend=self._backend) + if arguments is None: + arguments = ((),) * get_replica_count() + else: + arguments = [list(replica_args) for replica_args in arguments] + + # Check arguments + for replica, replica_args in enumerate(arguments): + for arg in replica_args: + if arg.is_deleted(): + raise ValueError('Executing with deleted local buffer argument') + if arg.replica() != replica: + raise ValueError( + 'Executing on replica {} with argument from replica {}'.format( + replica, arg.replica())) + + # Pull out argument buffer handles + stripped_args = [ + [arg.c_buffer for arg in replica_args] for replica_args in arguments + ] + + # Execute + if self._backend.backend_type == BackendType.XRT: + if len(stripped_args) > 1: + raise NotImplementedError( + 'Multi-replica execution is not yet supported via the XRT backend.') + output_buffers = [self._c_computation.Execute(stripped_args[0])] + else: + output_buffer_tup = self._c_computation.ExecutePerReplica(stripped_args) + size = output_buffer_tup.size() + output_buffers = [output_buffer_tup.Release(i) for i in xrange(size)] + + # Wrap output handles in LocalBuffer instances + return tuple( + LocalBuffer(output_buffer, backend=self._backend, replica=replica) + for replica, output_buffer in enumerate(output_buffers)) def ExecuteWithPythonValues(self, arguments=()): - """Execute with Python values as arguments and return value.""" - arguments = tuple( - LocalBuffer.from_pyval(arg, backend=self._backend) for arg in arguments) + """Execute on one replica with Python values as arguments and output.""" + + def put(arg): + return LocalBuffer.from_pyval(arg, backend=self._backend) + + arguments = [put(arg) for arg in arguments] return self.Execute(arguments).to_py() + def ExecuteWithPythonValuesPerReplica(self, arguments): + """Execute on many replicas with Python values as arguments and output.""" + + def put(arg, replica): + return LocalBuffer.from_pyval(arg, replica, backend=self._backend) + + arguments = [[put(arg, replica) + for arg in replica_args] + for replica, replica_args in enumerate(arguments)] + return [out.to_py() for out in self.ExecutePerReplica(arguments)] + def __del__(self): self._delete(self._c_computation) @@ -761,8 +855,7 @@ class ComputationBuilder(object): Returns: A LocalOp representing the added broadcast-in-dimensions op. """ - xla_shape = Shape.array_shape(self.GetShape(operand).element_type(), shape) - return self._client.BroadcastInDim(operand, xla_shape, broadcast_dimensions) + return self._client.BroadcastInDim(operand, shape, broadcast_dimensions) def Concatenate(self, operands, dimension): """Enqueues a concatenate operation onto the computation. @@ -1380,6 +1473,7 @@ def initialize_platform_name(platform_name): Raises: A runtime exception if the XLA service has already been initialized. 
""" + platform_name = _maybe_encode_string(platform_name) c_api.InitializePlatformName(platform_name) diff --git a/tensorflow/compiler/xla/python_api/xla_shape.py b/tensorflow/compiler/xla/python_api/xla_shape.py index f158f6b2410..95b2bf300ec 100644 --- a/tensorflow/compiler/xla/python_api/xla_shape.py +++ b/tensorflow/compiler/xla/python_api/xla_shape.py @@ -25,9 +25,10 @@ from tensorflow.compiler.xla.python_api import types class Shape(object): - """Wraps a xla_data_pb2.Shape message with a convenient Python type. + """Wraps a xla_data_pb2.ShapeProto message with a convenient Python type. - Provides direct access to the underlying xla_data_pb2.Shape message in the + Provides direct access to the underlying xla_data_pb2.ShapeProto message in + the message attribute, along with accessor wrappers to the message's fields. Avoid direct access to .message unless interacting directly with protobuf APIs like CopyFrom. In other words, prefer hauling the shape around in a Shape, and @@ -48,7 +49,7 @@ class Shape(object): Raises: ValueError: if element_type is TUPLE but dimensions are not Shape objects. """ - self.message = xla_data_pb2.Shape() + self.message = xla_data_pb2.ShapeProto() self.message.element_type = element_type if element_type == xla_data_pb2.TUPLE: if not all(isinstance(subshape, Shape) for subshape in dimensions): diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD index 3abb3855a42..26affbcceb3 100644 --- a/tensorflow/compiler/xla/rpc/BUILD +++ b/tensorflow/compiler/xla/rpc/BUILD @@ -16,7 +16,6 @@ xla_proto_library( use_grpc_plugin = True, visibility = ["//visibility:public"], deps = [ - "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", ], ) diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto index e4f332cda22..0ff8adc2acb 100644 --- a/tensorflow/compiler/xla/rpc/xla_service.proto +++ b/tensorflow/compiler/xla/rpc/xla_service.proto @@ -43,7 +43,6 @@ limitations under the License. 
syntax = "proto3"; import "tensorflow/compiler/xla/xla.proto"; -import "tensorflow/compiler/xla/xla_data.proto"; package xla; diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 19b5c1ca25d..81e71eee520 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -281,10 +281,12 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo_element_type_converter", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", "//tensorflow/core:test", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) @@ -292,6 +294,7 @@ cc_library( name = "hlo", srcs = [ "dfs_hlo_visitor.cc", + "dynamic_parameter_binding.cc", "hlo_computation.cc", "hlo_input_output_alias_config.cc", "hlo_instruction.cc", @@ -305,6 +308,7 @@ cc_library( hdrs = [ "dfs_hlo_visitor.h", "dfs_hlo_visitor_with_default.h", + "dynamic_parameter_binding.h", "hlo_clone_context.h", "hlo_computation.h", "hlo_domain_metadata.h", @@ -350,6 +354,25 @@ cc_library( ], ) +tf_cc_test( + name = "dynamic_parameter_binding_test", + srcs = ["dynamic_parameter_binding_test.cc"], + deps = [ + ":hlo", + ":hlo_dce", + ":hlo_memory_scheduler", + ":hlo_ordering", + ":hlo_parser", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + "@com_google_absl//absl/algorithm:container", + ], +) + tf_cc_test( name = "dfs_hlo_visitor_with_default_test", srcs = ["dfs_hlo_visitor_with_default_test.cc"], @@ -387,9 +410,36 @@ tf_cc_test( ":hlo", ":pattern_matcher", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "pattern_matcher_gmock", + testonly = 1, + hdrs = ["pattern_matcher_gmock.h"], + deps = [ + ":pattern_matcher", + "//tensorflow/compiler/xla:test", + "//tensorflow/core:test", + ], +) + +tf_cc_test( + name = "pattern_matcher_gmock_test", + srcs = ["pattern_matcher_gmock_test.cc"], + deps = [ + ":hlo", + ":pattern_matcher", + ":pattern_matcher_gmock", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", ], ) @@ -403,6 +453,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/base", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/types:span", ], @@ -1336,6 +1387,7 @@ cc_library( ":hlo", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", + "@com_google_absl//absl/container:flat_hash_set", ], ) @@ -1539,7 +1591,10 @@ tf_cc_test( ":hlo", ":hlo_casting_utils", ":hlo_matchers", + ":hlo_parser", ":hlo_pass", + ":pattern_matcher", + ":pattern_matcher_gmock", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -1707,7 +1762,9 @@ cc_library( ":hlo", ":hlo_pass", ":hlo_query", + ":pattern_matcher", ":while_loop_analysis", + 
"//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -1720,9 +1777,14 @@ tf_cc_test( name = "while_loop_simplifier_test", srcs = ["while_loop_simplifier_test.cc"], deps = [ + ":algebraic_simplifier", ":hlo", + ":hlo_cse", ":hlo_dce", ":hlo_matchers", + ":hlo_pass", + ":hlo_pass_pipeline", + ":tuple_simplifier", ":while_loop_simplifier", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:hlo_test_base", @@ -2347,6 +2409,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -2600,6 +2663,8 @@ tf_cc_test( ":hlo", ":hlo_matchers", ":layout_assignment", + ":pattern_matcher", + ":pattern_matcher_gmock", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:shape_util", @@ -2744,6 +2809,8 @@ tf_cc_test( ":hlo_matchers", ":hlo_parser", ":hlo_pass", + ":pattern_matcher", + ":pattern_matcher_gmock", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -2855,6 +2922,46 @@ tf_cc_test( ], ) +cc_library( + name = "hlo_get_dimension_size_rewriter", + srcs = ["hlo_get_dimension_size_rewriter.cc"], + hdrs = ["hlo_get_dimension_size_rewriter.h"], + deps = [ + ":hlo", + ":hlo_pass", + ":shape_inference", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + ], +) + +tf_cc_test( + name = "hlo_get_dimension_size_rewriter_test", + srcs = ["hlo_get_dimension_size_rewriter_test.cc"], + deps = [ + ":hlo", + ":hlo_get_dimension_size_rewriter", + ":hlo_matchers", + ":hlo_parser", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + cc_library( name = "device_memory_allocator", srcs = [ @@ -2913,6 +3020,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@llvm//:core", "@llvm//:transform_utils", @@ -3026,6 +3134,7 @@ cc_library( ":hlo_casting_utils", ":hlo_execution_profile", ":hlo_tfgraph_builder", + ":pattern_matcher", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", @@ -3318,9 +3427,9 @@ cc_library( ":tuple_util", ":while_loop_analysis", ":while_util", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", - "//tensorflow/core:lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -3463,6 +3572,8 @@ tf_cc_test( ":hlo_casting_utils", ":hlo_matchers", ":hlo_parser", + ":pattern_matcher", + ":pattern_matcher_gmock", 
"//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:window_util", "//tensorflow/core:lib", @@ -3513,6 +3624,41 @@ cc_library( ], ) +cc_library( + name = "ar_crs_combiner", + srcs = ["ar_crs_combiner.cc"], + hdrs = ["ar_crs_combiner.h"], + deps = [ + ":call_graph", + ":pattern_matcher", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_pass", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "ar_crs_combiner_test", + srcs = ["ar_crs_combiner_test.cc"], + deps = [ + ":ar_crs_combiner", + ":hlo", + ":hlo_matchers", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + tf_cc_test( name = "map_inliner_test", srcs = ["map_inliner_test.cc"], diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 89e62bd2f0d..985c5af1c4d 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include +#include #include #include #include @@ -68,6 +69,45 @@ bool IsAll(const HloInstruction* op, int8 value) { } } +// Checks whether `op` is a floating-point constant or broadcast of a constant +// of the form +/- 2^k for some integer k positive, negative, or zero. Such +// values are interesting because multiplying by a power of 2 just moves the +// exponent. +bool IsAllFpConstantPowerOf2(const HloInstruction* op) { + // Unwrap the broadcast if necessary. + const HloInstruction* c; + if (!Match(op, m::ConstantEffectiveScalar(&c)) && + !Match(op, m::Broadcast(m::Constant(&c).WithShape( + m::Shape().IsEffectiveScalar())))) { + return false; + } + auto val = [&]() -> absl::optional { + switch (c->shape().element_type()) { + case BF16: + return static_cast(c->literal().GetFirstElement()); + case F16: + return static_cast(c->literal().GetFirstElement()); + case F32: + return c->literal().GetFirstElement(); + case F64: + return c->literal().GetFirstElement(); + default: + // Cowardly refuse to consider complex types. + return absl::nullopt; + } + }(); + if (!val) { + return false; + } + + int exp; + double mantissa = std::frexp(*val, &exp); + // frexp returns a value in the range (-1; -0.5] U [0.5, 1). A return value + // of +/-0.5 therefore indicates that the floating point value is a power of + // 2. + return mantissa == 0.5 || mantissa == -0.5; +} + // Returns whether the given transpose produces a result which is bit-wise // identical to its operand and thus may be replaced with a bitcast. bool TransposeIsBitcast(const HloInstruction* transpose) { @@ -84,7 +124,8 @@ bool TransposeIsBitcast(const HloInstruction* transpose) { // reshape may still be a bitcast. For example, a reshape from [28x28] to [784]. 
bool ReshapeOrCopyIsBitcast( const HloInstruction* instr, - const AlgebraicSimplifier::ValidBitcastCallback& valid_bitcast_callback) { + const AlgebraicSimplifierOptions::ValidBitcastCallback& + valid_bitcast_callback) { CHECK(HloOpcode::kReshape == instr->opcode() || HloOpcode::kCopy == instr->opcode()); @@ -95,6 +136,11 @@ bool ReshapeOrCopyIsBitcast( valid_bitcast_callback(operand->shape(), instr->shape()); } +bool IsUnstridedSlice(const HloInstruction* hlo) { + return absl::c_all_of(hlo->slice_strides(), + [](int64 stride) { return stride == 1; }); +} + // AlgebraicSimplifierVisitor traverses the HLO computation and reduces certain // algebraic expressions to simplified forms. Note: This only supports // simplifications that simply look at the operands of an instruction. For the @@ -180,21 +226,13 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { const bool changed() const { return changed_; } // Runs the visitor on a computation. - static bool Run( - HloComputation* computation, bool is_layout_sensitive, - AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback, - bool enable_dot_strength_reduction, bool enable_conv_simplification); + static bool Run(HloComputation* computation, + const AlgebraicSimplifierOptions& options); private: - explicit AlgebraicSimplifierVisitor( - HloComputation* computation, bool is_layout_sensitive, - AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback, - bool enable_dot_strength_reduction, bool enable_conv_simplification) - : computation_(computation), - is_layout_sensitive_(is_layout_sensitive), - valid_bitcast_callback_(std::move(valid_bitcast_callback)), - enable_dot_strength_reduction_(enable_dot_strength_reduction), - enable_conv_simplification_(enable_conv_simplification) {} + explicit AlgebraicSimplifierVisitor(HloComputation* computation, + const AlgebraicSimplifierOptions& options) + : computation_(computation), options_(options) {} // Transforms Dots where at least one input is a vector or has a degenerate // dimension and converts it into a multiply and reduce. This should enable @@ -233,10 +271,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { HloInstruction* new_instruction); // Returns whether the shape of the output of the given instructions are the - // same for the purposes of simplification. If is_layout_sensitive_ is true, - // then this tests shape equality including layout (ShapeUtil::Equal). If - // is_layout_sensitive_ is false, then the tests shape compatibility - // (ShapeUtil::Compatible). + // same for the purposes of simplification. If options_.is_layout_sensitive() + // is true, then this tests shape equality including layout + // (ShapeUtil::Equal). If options_.is_layout_sensitive() is false, then the + // tests shape compatibility (ShapeUtil::Compatible). bool SameShape(const HloInstruction* lhs, const HloInstruction* rhs) const; // Returns whether it was possible to transform `root` to a clamp instruction. @@ -325,22 +363,12 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { // traversing. HloComputation* computation_; + // The backend-specific options selected for the algebraic simplifier. + const AlgebraicSimplifierOptions& options_; + // Whether algebraic simplification has occurred. bool changed_ = false; - // Whether layout is considered during transformation. - bool is_layout_sensitive_; - - // Callback used to determine if a bitcast is possible. 
- AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback_; - - // Disable dot strength reduction on platforms where it causes a slowdown. - bool enable_dot_strength_reduction_; - - // Disable convolution -> dot simplification on platforms where it causes a - // slowdown. - bool enable_conv_simplification_; - // Cached computation for adding two scalar F32. HloComputation* scalar_add_computation_ = nullptr; }; @@ -348,19 +376,15 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { } // namespace bool AlgebraicSimplifierVisitor::Run( - HloComputation* computation, bool is_layout_sensitive, - AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback, - bool enable_dot_strength_reduction, bool enable_conv_simplification) { - AlgebraicSimplifierVisitor visitor( - computation, is_layout_sensitive, std::move(valid_bitcast_callback), - enable_dot_strength_reduction, enable_conv_simplification); + HloComputation* computation, const AlgebraicSimplifierOptions& options) { + AlgebraicSimplifierVisitor visitor(computation, options); TF_CHECK_OK(computation->Accept(&visitor)); return visitor.changed_; } bool AlgebraicSimplifierVisitor::SameShape(const HloInstruction* lhs, const HloInstruction* rhs) const { - if (is_layout_sensitive_) { + if (options_.is_layout_sensitive()) { return ShapeUtil::Equal(lhs->shape(), rhs->shape()); } else { return ShapeUtil::Compatible(lhs->shape(), rhs->shape()); @@ -431,6 +455,40 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { sum_of_constants)); } + // A*C + B*C => (A+B)*C + // + // - If A, B, and C are integers, do this unconditionally. Proof of + // correctness: https://rise4fun.com/Alive/u9X. + // + // - If A, B, and C are floating point, do this if C is a scalar constant or + // broadcast of scalar constant and is equal to +/- 2^k for some (possibly + // negative) integer k. + // + // Multiplying by a power of 2 just moves the exponent, so our answer is + // exact modulo rounding of intermediate results so long as + // + // - none of the three products has an exponent which underflows (so the + // result is 0 or denormal), and + // - none of the three products overflows to inf. + // + // Proof: See algebraic_simplifier_proof_distributive_property.py. + // + // We deem these differences in rounding, underflow, and overflow + // acceptable in the ML context. 
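A quick numerical illustration of the comment above, using plain Python floats rather than XLA: when C is a power of two the factored form rounds identically, while a non-power-of-two C can introduce a rounding difference.

```python
a, b = 0.1, 0.2

# C = 2**-3: multiplying by a power of two only shifts the exponent, so
# A*C + B*C and (A+B)*C round to the same double (barring overflow/underflow).
c = 0.125
assert a * c + b * c == (a + b) * c

# C = 0.3 is not a power of two; the intermediate roundings may differ, which
# is why the rewrite is restricted to power-of-two constants for FP types.
c = 0.3
print(a * c + b * c, (a + b) * c)  # the two results are not guaranteed to match
```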
+ HloInstruction *b, *c; + if (((Match(lhs, m::Multiply(m::Op(&a), m::Op(&c))) && + Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b)))) || + (Match(lhs, m::Multiply(m::Op(&c), m::Op(&a))) && + Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b))))) && + (ShapeUtil::ElementIsIntegral(add->shape()) || + IsAllFpConstantPowerOf2(c))) { + return ReplaceWithNewInstruction( + add, HloInstruction::CreateBinary( + add->shape(), HloOpcode::kMultiply, + computation_->AddInstruction(HloInstruction::CreateBinary( + add->shape(), HloOpcode::kAdd, a, b)), + c)); + } return Status::OK(); } @@ -504,8 +562,8 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) { return Status::OK(); } - if (is_layout_sensitive_ && - ReshapeOrCopyIsBitcast(copy, valid_bitcast_callback_)) { + if (options_.is_layout_sensitive() && + ReshapeOrCopyIsBitcast(copy, options_.valid_bitcast_callback())) { ReplaceWithBitcast(copy); } @@ -541,7 +599,74 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( VLOG(10) << "trying to replace " << concatenate->ToString() << " with " << replacement->ToString(); ReplaceInstructionIfSameShape(concatenate, replacement); - } else if (operands.size() == 2) { + return Status::OK(); + } + + // Check if we can merge "adjacent" slice operands which take slices from the + // same other op. For simplicity we only merge unstrided slices. + int64 concatenate_dimension = concatenate->concatenate_dimension(); + for (int64 i = 0; i < operands.size(); ++i) { + if (operands[i]->opcode() != HloOpcode::kSlice || + !IsUnstridedSlice(operands[i])) { + continue; + } + int64 slice_end = operands[i]->slice_limits(concatenate_dimension); + HloInstruction* slice_operand = operands[i]->mutable_operand(0); + int64 j = i + 1; + while (j < operands.size() && operands[j]->opcode() == HloOpcode::kSlice && + IsUnstridedSlice(operands[j]) && + operands[j]->operand(0) == slice_operand && + operands[j]->slice_starts(concatenate_dimension) == slice_end) { + // Check that all the slice_start values are the same in all other + // dimensions. This implies that the slice_limit values are also the same, + // because operands of concatenate need to have the same shape, and we + // already checked that the slices are unstrided. 
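The loop being built here merges runs of adjacent unstrided slices of the same operand along the concatenate dimension. The intended effect, sketched with NumPy (illustrative only):

```python
import numpy as np

x = np.arange(24).reshape(4, 6)

# Two unstrided slices of the same operand that abut along the concatenate
# dimension (the first slice's limit equals the second slice's start, and all
# other start indices agree) ...
merged = np.concatenate([x[:, 1:3], x[:, 3:6]], axis=1)

# ... are equivalent to a single wider slice of that operand.
assert np.array_equal(merged, x[:, 1:6])
```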
+ bool same_other_starts = true; + for (int64 k = 0; k < operands[j]->slice_starts().size(); ++k) { + if (k == concatenate_dimension) { + continue; + } + if (operands[i]->slice_starts(k) != operands[j]->slice_starts(k)) { + same_other_starts = false; + break; + } + } + if (!same_other_starts) { + break; + } + slice_end = operands[j]->slice_limits(concatenate_dimension); + ++j; + } + if (j - i > 1) { + Shape new_slice_shape = operands[i]->shape(); + new_slice_shape.set_dimensions( + concatenate_dimension, + slice_end - operands[i]->slice_starts(concatenate_dimension)); + auto new_limit_indices = operands[i]->slice_limits(); + new_limit_indices[concatenate_dimension] = slice_end; + auto new_slice_op = + computation_->AddInstruction(HloInstruction::CreateSlice( + new_slice_shape, slice_operand, + /*start_indices=*/operands[i]->slice_starts(), + /*limit_indices=*/new_limit_indices, + /*strides=*/operands[i]->slice_strides())); + std::vector new_operands; + for (int64 k = 0; k < i; ++k) { + new_operands.push_back(operands[k]); + } + new_operands.push_back(new_slice_op); + for (int64 k = j; k < operands.size(); ++k) { + new_operands.push_back(operands[k]); + } + auto replacement = + computation_->AddInstruction(concatenate->CloneWithNewOperands( + concatenate->shape(), new_operands)); + ReplaceInstructionIfSameShape(concatenate, replacement); + return Status::OK(); + } + } + + if (operands.size() == 2) { // A binary concat with a broadcasted scalar as an operand can be converted // into a pad which is simpler to fold into other operations. bool is_effective_low_pad = Match( @@ -557,7 +682,7 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( padding_config_dim->set_edge_padding_high(0); padding_config_dim->set_edge_padding_low(0); padding_config_dim->set_interior_padding(0); - if (dim == concatenate->concatenate_dimension()) { + if (dim == concatenate_dimension) { if (is_effective_low_pad) { padding_config_dim->set_edge_padding_low( operands[0]->shape().dimensions(dim)); @@ -1215,7 +1340,8 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { return ReplaceInstruction(dot, dot_of_gather_optimized); } - if (enable_dot_strength_reduction_ && !is_layout_sensitive_) { + if (options_.enable_dot_strength_reduction() && + !options_.is_layout_sensitive()) { TF_ASSIGN_OR_RETURN(bool did_strength_reduction, HandleDotStrengthReduction(dot)); if (did_strength_reduction) { @@ -1619,6 +1745,27 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { pad, HloInstruction::CreateBroadcast(pad->shape(), pad->mutable_operand(1), {})); } + + // Interior padding on size-one dimensions has no effect. As a result it + // makes other simplifications possible if there is no interior padding. + if (HasInteriorPadding(pad->padding_config())) { + PaddingConfig padding_config = pad->padding_config(); + bool cleared_interior_padding = false; + for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) { + if (padding_config.dimensions(i).interior_padding() > 0 && + pad->operand(0)->shape().dimensions(i) == 1) { + cleared_interior_padding = true; + padding_config.mutable_dimensions(i)->set_interior_padding(0); + } + } + if (cleared_interior_padding) { + return ReplaceWithNewInstruction( + pad, + HloInstruction::CreatePad(pad->shape(), pad->mutable_operand(0), + pad->mutable_operand(1), padding_config)); + } + } + // Eliminate nop pads (padding all zero), and replace a pad with negative // padding with a pad with non-negative padding followed by a slice.
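Interior padding inserts elements between adjacent entries of a dimension, so a dimension of extent 1 has no interior positions to pad and the value can be cleared without changing the result. A toy extent calculation (not XLA code) that assumes the usual edge-low/edge-high/interior padding semantics:

```python
def padded_extent(size, edge_low, edge_high, interior):
    # Interior padding adds `interior` elements between each adjacent pair.
    return edge_low + size + max(size - 1, 0) * interior + edge_high

# For a size-1 dimension, any interior padding value yields the same extent ...
assert padded_extent(1, edge_low=2, edge_high=3, interior=7) == \
       padded_extent(1, edge_low=2, edge_high=3, interior=0)

# ... whereas it matters as soon as the dimension has more than one element.
assert padded_extent(4, edge_low=0, edge_high=0, interior=1) != \
       padded_extent(4, edge_low=0, edge_high=0, interior=0)
```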
bool all_zero = true; @@ -1910,8 +2057,8 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) { } // Make this a bitcast if possible. - if (is_layout_sensitive_ && - ReshapeOrCopyIsBitcast(reshape, valid_bitcast_callback_)) { + if (options_.is_layout_sensitive() && + ReshapeOrCopyIsBitcast(reshape, options_.valid_bitcast_callback())) { ReplaceWithBitcast(reshape); return Status::OK(); } @@ -2030,11 +2177,6 @@ StatusOr AlgebraicSimplifierVisitor::TrySimplifyScalarSlice( return false; } -bool IsUnstridedSlice(const HloInstruction* hlo) { - return absl::c_all_of(hlo->slice_strides(), - [](int64 stride) { return stride == 1; }); -} - StatusOr AlgebraicSimplifierVisitor::TryToReorderSliceAndReshape( HloInstruction* slice) { CHECK_EQ(slice->opcode(), HloOpcode::kSlice); @@ -2501,6 +2643,108 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) { return ReplaceWithNewInstruction( sort, HloInstruction::CreateTuple(sort->operands())); } + if (!options_.enable_permutation_sort_replacement()) { + return Status::OK(); + } + // Check if we are sorting a permutation. In that case, we know that the keys + // will be sorted to the identity permutation, and we can represent the + // changes to the 'values' parameter as a scatter. + if (sort->operand_count() == 2 && + operand->opcode() == HloOpcode::kGetTupleElement) { + const HloInstruction* other_sort = operand->operand(0); + // Check whether the 'values' parameter is the result of another sort with + // the same sort dimension. + if (other_sort->opcode() == HloOpcode::kSort && + other_sort->operand_count() >= 2 && + other_sort->dimensions(0) == dimension_to_sort && + other_sort->operand(operand->tuple_index())->opcode() == + HloOpcode::kIota) { + auto* iota = + Cast(other_sort->operand(operand->tuple_index())); + // The sort operand needs to be an integral iota, and the iota dimension + // needs to be the dimension that was sorted. + if (iota->iota_dimension() == dimension_to_sort && + ShapeUtil::ElementIsIntegral(iota->shape())) { + // We use the following construction method for a Scatter that applies + // the permutation from 'keys' to the 'values' parameter. + // - Take the "keys" parameter of the second sort and reshape it to have + // another "1" dimension at the end. + // - Concatenate it with iotas of the same extended shape with all + // different iota_dimensions except the dimension_to_sort in the order + // of iota_dimensions/dimension_to_sort, so e.g. with rank 3 and + // dimension_to_sort = 1, we would have concatenate of (iota with + // iota_dimension=0, keys, iota with iota_dimension = 2) + // - Use this as the indices parameter of scatter, and set updates + // of the scatter to be a reshaped 'values' parameter of sort (adding + // 'rank' many 1 dimensions at the end). 
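The construction described in the comment above rests on the observation that sorting by keys which are themselves a permutation reorders the values exactly as a scatter indexed by those keys would. A NumPy sketch of that equivalence (illustrative only):

```python
import numpy as np

keys = np.array([2, 0, 3, 1])        # a permutation, e.g. an iota reordered by an earlier sort
values = np.array([10, 20, 30, 40])

# Sorting (keys, values) by keys yields (iota, permuted values) ...
order = np.argsort(keys)
sorted_keys, sorted_values = keys[order], values[order]

# ... which is the same as scattering `values` to positions given by `keys`.
scattered = np.empty_like(values)
scattered[keys] = values

assert np.array_equal(sorted_keys, np.arange(4))
assert np.array_equal(sorted_values, scattered)
```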
+ int64 rank = ShapeUtil::Rank(operand->shape()); + Shape extended_shape = operand->shape(); + extended_shape.add_dimensions(1); + extended_shape.mutable_layout()->add_minor_to_major(rank); + auto reshaped_permutation = computation_->AddInstruction( + HloInstruction::CreateReshape(extended_shape, operand)); + std::vector concat_operands; + for (int64 i = 0; i < rank; ++i) { + if (i == dimension_to_sort) { + concat_operands.push_back(reshaped_permutation); + } else { + concat_operands.push_back(computation_->AddInstruction( + HloInstruction::CreateIota(extended_shape, i))); + } + } + Shape concat_shape = operand->shape(); + concat_shape.add_dimensions(rank); + concat_shape.mutable_layout()->add_minor_to_major(rank); + auto scatter_indices = + rank > 1 ? computation_->AddInstruction( + HloInstruction::CreateConcatenate( + concat_shape, concat_operands, rank)) + : reshaped_permutation; + + // We don't care about the operand, it will be completely overridden by + // the updates. + auto scatter_operand = computation_->AddInstruction( + HloInstruction::CreateIota(sort->operand(1)->shape(), 0)); + + // Construct the updates operand of scatter. + Shape update_shape = sort->operand(1)->shape(); + for (int64 i = 0; i < rank; ++i) { + update_shape.add_dimensions(1); + update_shape.mutable_layout()->add_minor_to_major(rank + i); + } + auto scatter_updates = + computation_->AddInstruction(HloInstruction::CreateReshape( + update_shape, sort->mutable_operand(1))); + + // Construct the updates computation, which simply replaces the operand + // values with the update values. + HloComputation::Builder b("update_replace_computation"); + Shape scalar_shape = ShapeUtil::MakeShape(S32, {}); + b.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "scalar_lhs")); + auto scalar_rhs = b.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "scalar_rhs")); + auto update_replace_computation = + computation_->parent()->AddEmbeddedComputation(b.Build(scalar_rhs)); + + ScatterDimensionNumbers dim_numbers; + dim_numbers.set_index_vector_dim(rank); + for (int64 i = 0; i < rank; ++i) { + dim_numbers.add_update_window_dims(rank + i); + dim_numbers.add_scatter_dims_to_operand_dims(i); + } + auto scatter = + computation_->AddInstruction(HloInstruction::CreateScatter( + sort->operand(1)->shape(), scatter_operand, scatter_indices, + scatter_updates, update_replace_computation, dim_numbers)); + return ReplaceWithNewInstruction( + sort, HloInstruction::CreateTuple( + {computation_->AddInstruction(HloInstruction::CreateIota( + operand->shape(), dimension_to_sort)), + scatter})); + } + } + } return Status::OK(); } @@ -2525,7 +2769,7 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) { return ReplaceInstruction(transpose, operand); } - if (is_layout_sensitive_ && TransposeIsBitcast(transpose)) { + if (options_.is_layout_sensitive() && TransposeIsBitcast(transpose)) { ReplaceWithBitcast(transpose); return Status::OK(); } @@ -2674,13 +2918,13 @@ StatusOr AlgebraicSimplifierVisitor::SimplifyConvToDot( const ConvolutionDimensionNumbers& dnums = convolution->convolution_dimension_numbers(); - if (!enable_conv_simplification_) { + if (!options_.enable_conv_simplification()) { return false; } // TODO(b/31337498): For now, we cowardly refuse to do this optimization in // layout-insensitive mode, for fear of adding nontrivial reshapes. 
- if (!is_layout_sensitive_) { + if (!options_.is_layout_sensitive()) { return false; } @@ -2770,9 +3014,9 @@ StatusOr AlgebraicSimplifierVisitor::SimplifyConvToDot( // We cannot insert bitcasts if the layouts will not be compatible. // TODO(b/33178038): Consider inserting a transpose if a bitcast would be // invalid. - if (!valid_bitcast_callback_(input_shape, new_input_shape) || - !valid_bitcast_callback_(filter_shape, new_filter_shape) || - !valid_bitcast_callback_(dot_output_shape, convolution_shape)) { + if (!options_.valid_bitcast_callback()(input_shape, new_input_shape) || + !options_.valid_bitcast_callback()(filter_shape, new_filter_shape) || + !options_.valid_bitcast_callback()(dot_output_shape, convolution_shape)) { return false; } @@ -2878,9 +3122,7 @@ StatusOr AlgebraicSimplifier::Run(HloModule* module) { "AlgebraicSimplifier::Run(), before:\n" + module->ToString()); bool changed = false; for (auto* comp : module->MakeNonfusionComputations()) { - if (AlgebraicSimplifierVisitor::Run( - comp, is_layout_sensitive_, valid_bitcast_callback_, - enable_dot_strength_reduction_, enable_conv_simplification_)) { + if (AlgebraicSimplifierVisitor::Run(comp, options_)) { changed = true; } } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index 9f8d0ee88bd..d2775b9fafa 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -23,8 +23,7 @@ limitations under the License. namespace xla { -// A pass which performs algebraic simplifications. -class AlgebraicSimplifier : public HloModulePass { +class AlgebraicSimplifierOptions { public: // Given shapes 'from_shape' and 'to_shape', determines if it is valid to // bitcast from 'from_shape' to 'to_shape' after considering platform @@ -34,18 +33,63 @@ class AlgebraicSimplifier : public HloModulePass { using ValidBitcastCallback = std::function; + explicit AlgebraicSimplifierOptions( + ValidBitcastCallback valid_bitcast_callback) + : valid_bitcast_callback_(std::move(valid_bitcast_callback)) {} + // If valid_bitcast_callback returns true, then the pass will replace reshapes + // and transposes with bitcasts. + const ValidBitcastCallback& valid_bitcast_callback() const { + return valid_bitcast_callback_; + } + // If is_layout_sensitive is true, then the simplifier preserves layout during - // transformation. Otherwise, layout is ignored. If valid_bitcast_callback - // returns true, then the pass will replace reshapes and transposes with - // bitcasts. - AlgebraicSimplifier(bool is_layout_sensitive, - ValidBitcastCallback valid_bitcast_callback, - bool enable_dot_strength_reduction = true, - bool enable_conv_simplification = true) - : is_layout_sensitive_(is_layout_sensitive), - valid_bitcast_callback_(std::move(valid_bitcast_callback)), - enable_dot_strength_reduction_(enable_dot_strength_reduction), - enable_conv_simplification_(enable_conv_simplification) {} + // transformation. Otherwise, layout is ignored. + void set_is_layout_sensitive(bool is_layout_sensitive) { + is_layout_sensitive_ = is_layout_sensitive; + } + bool is_layout_sensitive() const { return is_layout_sensitive_; } + + // Enable dot simplification on platforms where it is profitable. 
+ void set_enable_dot_strength_reduction(bool enable_dot_strength_reduction) { + enable_dot_strength_reduction_ = enable_dot_strength_reduction; + } + bool enable_dot_strength_reduction() const { + return enable_dot_strength_reduction_; + } + + // Enable convolution simplification on platforms where it is profitable. + void set_enable_conv_simplification(bool enable_conv_simplification) { + enable_conv_simplification_ = enable_conv_simplification; + } + bool enable_conv_simplification() const { + return enable_conv_simplification_; + } + + // If enable_permutation_sort_replacement is true, a sort op that is known to + // sort a permutation will be replaced with a scatter op. + void set_enable_permutation_sort_replacement( + bool enable_permutation_sort_replacement) { + enable_permutation_sort_replacement_ = enable_permutation_sort_replacement; + } + bool enable_permutation_sort_replacement() const { + return enable_permutation_sort_replacement_; + } + + private: + ValidBitcastCallback valid_bitcast_callback_; + bool is_layout_sensitive_{false}; + bool enable_dot_strength_reduction_{true}; + bool enable_conv_simplification_{true}; + bool enable_permutation_sort_replacement_{false}; +}; + +// A pass which performs algebraic simplifications. +class AlgebraicSimplifier : public HloModulePass { + public: + // If is_layout_sensitive is true, then the simplifier preserves layout during + // transformation. Otherwise, layout is ignored. + explicit AlgebraicSimplifier(const AlgebraicSimplifierOptions& options) + : options_(options) {} ~AlgebraicSimplifier() override = default; absl::string_view name() const override { return "algsimp"; } @@ -54,14 +98,7 @@ class AlgebraicSimplifier : public HloModulePass { StatusOr Run(HloModule* module) override; private: - bool is_layout_sensitive_; - ValidBitcastCallback valid_bitcast_callback_; - - // Enable dot simplification on platforms where it is profitable. - bool enable_dot_strength_reduction_; - - // Enable convolution simplification on platforms where it is profitable. - bool enable_conv_simplification_; + AlgebraicSimplifierOptions options_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py b/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py new file mode 100644 index 00000000000..5da13da041b --- /dev/null +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py @@ -0,0 +1,82 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Proof that transforming (A*C)+(B*C) <=> (A+B)*C is "safe" if C=2^k. 
+ +Specifically, for all floating-point values A, B, and C, if + + - C is equal to +/- 2^k for some (possibly negative) integer k, and + - A, B, C, A*C, B*C, and A+B are not subnormal, zero, or inf, + +then there exists a rounding mode rm in [RTZ, RNE] such that + + (A*C) + (B*C) == (A+B) * C (computed with rounding mode rm). + +Informally, this means that the equivalence holds for powers of 2 C, modulo +flushing to zero or inf, and modulo rounding of intermediate results. + +Requires z3 python bindings; try `pip install z3-solver`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import z3 + +# We do float16 because it lets the solver run much faster. These results +# should generalize to fp32 and fp64, and you can verify this by changing the +# value of FLOAT_TY (and then waiting a while). +FLOAT_TY = z3.Float16 + +a = z3.FP("a", FLOAT_TY()) +b = z3.FP("b", FLOAT_TY()) +c = z3.FP("c", FLOAT_TY()) + +s = z3.Solver() + +# C must be a power of 2, i.e. significand bits must all be 0. +s.add(z3.Extract(FLOAT_TY().sbits() - 1, 0, z3.fpToIEEEBV(c)) == 0) + +for rm in [z3.RTZ(), z3.RNE()]: + z3.set_default_rounding_mode(rm) + before = a * c + b * c + after = (a + b) * c + + # Check that before == after, allowing that 0 == -0. + s.add( + z3.Not( + z3.Or( + before == after, # + z3.And(z3.fpIsZero(before), z3.fpIsZero(after))))) + + for x in [ + (a * c), + (b * c), + (a + b), + ]: + s.add(z3.Not(z3.fpIsSubnormal(x))) + s.add(z3.Not(z3.fpIsZero(x))) + s.add(z3.Not(z3.fpIsInf(x))) + +if s.check() == z3.sat: + m = s.model() + print("Counterexample found!") + print(m) + print("a*c: ", z3.simplify(m[a] * m[c])) + print("b*c: ", z3.simplify(m[b] * m[c])) + print("a+b: ", z3.simplify(m[a] + m[b])) + print("a*c + b*c: ", z3.simplify(m[a] * m[c] + m[b] * m[c])) + print("(a+b) * c: ", z3.simplify((m[a] + m[b]) * m[c])) +else: + print("Proved!") diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index e4c4da1b0e7..14ce519b6a0 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -27,9 +27,11 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" -#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" @@ -42,18 +44,20 @@ namespace xla { namespace { using ::testing::ElementsAre; +namespace m = match; -namespace op = xla::testing::opcode_matchers; - -AlgebraicSimplifier::ValidBitcastCallback bitcasting_callback() { +AlgebraicSimplifierOptions::ValidBitcastCallback bitcasting_callback() { return [](const Shape&, const Shape&) { return true; }; } -AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() { +AlgebraicSimplifierOptions::ValidBitcastCallback non_bitcasting_callback() { return [](const Shape&, const Shape&) { return false; }; } -class AlgebraicSimplifierTest : public HloTestBase {}; +class AlgebraicSimplifierTest : public HloTestBase { + protected: + AlgebraicSimplifierOptions default_options_{non_bitcasting_callback()}; +}; // Test that A + 0 is simplified to A TEST_F(AlgebraicSimplifierTest, AddZero) { @@ -70,13 +74,134 @@ TEST_F(AlgebraicSimplifierTest, AddZero) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); } +TEST_F(AlgebraicSimplifierTest, FactorIntegerAddition) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = s32[8] parameter(0) + p1 = s32[8] parameter(1) + p2 = s32[8] parameter(2) + x = s32[8] multiply(p0, p2) + y = s32[8] multiply(p1, p2) + ROOT sum = s32[8] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + AlgebraicSimplifier simplifier(default_options_); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), m::Parameter(2)))); +} + +// A*C + B*C => (A+B)*C if C is a floating-point power of 2. +TEST_F(AlgebraicSimplifierTest, FactorFpAddition) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + p1 = f32[] parameter(1) + c = f32[] constant(0.125) + x = f32[] multiply(p0, c) + y = f32[] multiply(p1, c) + ROOT sum = f32[] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), + m::ConstantScalar(0.125)))); +} + +// A*C + B*C => (A+B)*C if C is a broadcast of a floating-point power of 2. 
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionWithBroadcast) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[4] parameter(0) + p1 = f32[4] parameter(1) + c = f32[] constant(0.125) + b = f32[4] broadcast(c), dimensions={} + x = f32[4] multiply(p0, b) + y = f32[4] multiply(p1, b) + ROOT sum = f32[4] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), + m::Broadcast(m::ConstantScalar(0.125))))); +} + +// A*C + B*C => (A+B)*C simplification should not happen if C is not a +// floating-point power of 2. +TEST_F(AlgebraicSimplifierTest, FactorFpAdditionNotPowerOf2) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + p1 = f32[] parameter(1) + c = f32[] constant(0.3) + x = f32[] multiply(p0, c) + y = f32[] multiply(p1, c) + ROOT sum = f32[] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + EXPECT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); +} + +// A*C + B*C => (A+B)*C simplification should not happen if A, B, and C are +// complex numbers. +TEST_F(AlgebraicSimplifierTest, FactorFpAdditionComplex) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = c64[8] parameter(0) + p1 = c64[8] parameter(1) + p2 = c64[8] parameter(2) + x = c64[8] multiply(p0, p2) + y = c64[8] multiply(p1, p2) + ROOT sum = c64[8] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + EXPECT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); +} + +// A*C + B*C => (A+B)*C simplification is OK if A, B, and C are bfloat16.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionBfloat16) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = bf16[4] parameter(0) + p1 = bf16[4] parameter(1) + c = bf16[] constant(0.125) + b = bf16[4] broadcast(c), dimensions={} + x = bf16[4] multiply(p0, b) + y = bf16[4] multiply(p1, b) + ROOT sum = bf16[4] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), + m::Broadcast(m::ConstantScalar(0.125))))); +} + // Test that A * 0 is simplified to 0 TEST_F(AlgebraicSimplifierTest, MulZero) { auto m = CreateNewVerifiedModule(); @@ -92,8 +217,7 @@ TEST_F(AlgebraicSimplifierTest, MulZero) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kMultiply); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), zero); } @@ -115,8 +239,7 @@ TEST_F(AlgebraicSimplifierTest, SelectTrue) { auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSelect); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param0); } @@ -138,8 +261,7 @@ TEST_F(AlgebraicSimplifierTest, SelectFalse) { auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSelect); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param1); } @@ -159,8 +281,7 @@ TEST_F(AlgebraicSimplifierTest, SelectIdentical) { auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSelect); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param1); } @@ -196,11 +317,10 @@ TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) { builder.AddInstruction(HloInstruction::CreateReduce(r1f32, reduce0, zero, dims1, add_computation)); m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = m->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Reduce(param, zero)); + EXPECT_THAT(root, GmockMatch(m::Reduce(m::Parameter(0), m::Op().Is(zero)))); EXPECT_EQ(root->dimensions(), std::vector({0, 2, 3})); } @@ -219,11 +339,10 @@ TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) { auto computation = m->AddEntryComputation(builder.Build()); 
HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Add(param0, op::Constant())); + EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0), m::Constant()))); } // Test that [(A + C1) + C2] => [A + (C1 + C2)] for constants C1 and C2. @@ -246,11 +365,12 @@ TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Add(param0, op::Add(constant1, constant2))); + EXPECT_THAT(root, GmockMatch(m::Add( + m::Op().Is(param0), + m::Add(m::Op().Is(constant1), m::Op().Is(constant2))))); } TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) { @@ -269,8 +389,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -306,11 +425,11 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kMap); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Add(param0, op::Broadcast(zero))); + EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0), + m::Broadcast(m::Op().Is(zero))))); } TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) { @@ -329,8 +448,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -344,12 +462,11 @@ TEST_F(AlgebraicSimplifierTest, ConstantToBroadcast) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); - EXPECT_THAT(root, op::Constant()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + EXPECT_THAT(root, GmockMatch(m::Constant())); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Broadcast(op::Constant())); + EXPECT_THAT(root, 
GmockMatch(m::Broadcast(m::Constant()))); EXPECT_EQ(3.14f, root->operand(0)->literal().GetFirstElement()); } @@ -361,12 +478,11 @@ TEST_F(AlgebraicSimplifierTest, ConstantNotToBroadcast) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); - EXPECT_THAT(root, op::Constant()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + EXPECT_THAT(root, GmockMatch(m::Constant())); + AlgebraicSimplifier simplifier(default_options_); ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Constant()); + EXPECT_THAT(root, GmockMatch(m::Constant())); } TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) { @@ -377,12 +493,11 @@ TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); - EXPECT_THAT(root, op::Constant()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + EXPECT_THAT(root, GmockMatch(m::Constant())); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Iota()); + EXPECT_THAT(root, GmockMatch(m::Iota())); } // Test that A - 0 is simplified to A @@ -400,8 +515,7 @@ TEST_F(AlgebraicSimplifierTest, SubZero) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSubtract); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -422,11 +536,11 @@ TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSubtract); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Add(param0, op::Negate(constant))); + EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0), + m::Negate(m::Op().Is(constant))))); } // Test that (A/B)/C is simplified to A/(B*C). @@ -448,14 +562,16 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Divide(op::Divide(param0, param1), param2)); + GmockMatch(m::Divide(m::Divide(m::Parameter(0), m::Parameter(1)), + m::Parameter(2)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), - op::Divide(param0, op::Multiply(param1, param2))); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Divide(m::Parameter(0), + m::Multiply(m::Parameter(1), m::Parameter(2))))); } // Test that A/(B/C) is simplified to (A*C)/B. 
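The next several hunks cover the division rewrites (LhsDivOfDiv, RhsDivOfDiv, DivOfDivAndDiv, DivOfExp, DivOfPower, DivideByConstant). For reference, the identities those tests encode are the elementary ones summarized below; the notation is mine and only restates the comments already present in the test file:

$$\frac{A/B}{C}=\frac{A}{B\,C},\qquad \frac{A}{B/C}=\frac{A\,C}{B},\qquad \frac{A/B}{C/D}=\frac{A\,D}{B\,C},\qquad \frac{A}{e^{B}}=A\,e^{-B},\qquad \frac{A}{B^{C}}=A\,B^{-C}.$$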
@@ -476,15 +592,18 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), - op::Divide(param0, op::Divide(param1, param2))); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Divide(m::Parameter(0), + m::Divide(m::Parameter(1), m::Parameter(2))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), - op::Divide(op::Multiply(param0, param2), param1)); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Divide(m::Multiply(m::Parameter(0), m::Parameter(2)), + m::Parameter(1)))); } // Test that (A/B)/(C/D) is simplified to (A*D)/(B*C). @@ -511,15 +630,16 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) { EXPECT_THAT( computation->root_instruction(), - op::Divide(op::Divide(param0, param1), op::Divide(param2, param3))); + GmockMatch(m::Divide(m::Divide(m::Parameter(0), m::Parameter(1)), + m::Divide(m::Parameter(2), m::Parameter(3))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT( computation->root_instruction(), - op::Divide(op::Multiply(param0, param3), op::Multiply(param1, param2))); + GmockMatch(m::Divide(m::Multiply(m::Parameter(0), m::Parameter(3)), + m::Multiply(m::Parameter(1), m::Parameter(2))))); } // Test that A/exp(B) is simplified to A*exp(-B). @@ -539,14 +659,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Divide(param0, op::Exp(param1))); + GmockMatch(m::Divide(m::Parameter(0), m::Exp(m::Parameter(1))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::Multiply(param0, op::Exp(op::Negate(param1)))); + GmockMatch(m::Multiply(m::Parameter(0), + m::Exp(m::Negate(m::Parameter(1)))))); } // Test that A/pow(B,C) is simplified to A*pow(B,-C). 
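Beyond swapping in `default_options_`, the recurring mechanical change in these hunks is the move from the HloMatcher-style `op::` matchers to `GmockMatch` with the `m::` pattern matchers, where concrete `HloInstruction*` pointers are wrapped in `m::Op().Is(...)` and parameters are matched by index. A minimal before/after sketch, built only from names that already appear in this diff (the includes and the `op::`/`m::` namespace aliases are assumed to be set up elsewhere in the test file):

```
// Old style: HloMatcher-based matchers, binding instruction pointers directly.
EXPECT_THAT(computation->root_instruction(),
            op::Divide(param0, op::Exp(param1)));

// New style: pattern matchers via GmockMatch; parameters matched by index,
// arbitrary instruction pointers wrapped in m::Op().Is(...).
EXPECT_THAT(computation->root_instruction(),
            GmockMatch(m::Divide(m::Parameter(0), m::Exp(m::Parameter(1)))));
EXPECT_THAT(root, GmockMatch(m::Reduce(m::Parameter(0), m::Op().Is(zero))));
```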
@@ -567,15 +687,18 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), - op::Divide(param0, op::Power(param1, param2))); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Divide(m::Parameter(0), + m::Power(m::Parameter(1), m::Parameter(2))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::Multiply(param0, op::Power(param1, op::Negate(param2)))); + GmockMatch(m::Multiply( + m::Parameter(0), + m::Power(m::Parameter(1), m::Negate(m::Parameter(2)))))); } // Test that broadcasting is done on the right step when simplifying A/pow(B,C) @@ -597,15 +720,18 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), - op::Divide(param0, op::Power(param1, param2))); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Divide(m::Parameter(0), + m::Power(m::Parameter(1), m::Parameter(2))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); ASSERT_THAT(computation->root_instruction(), - op::Multiply(param0, op::Power(param1, op::Negate(param2)))); + GmockMatch(m::Multiply( + m::Parameter(0), + m::Power(m::Parameter(1), m::Negate(m::Parameter(2)))))); } // A / Const => A * InvertedConst @@ -623,12 +749,11 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) { auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::Multiply(param0, op::Constant())); + GmockMatch(m::Multiply(m::Parameter(0), m::Constant()))); } // pow(pow(A, X), Y) => pow(A, X*Y) @@ -648,11 +773,12 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) { inner_power, exp2)); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), - op::Power(base, op::Multiply(exp1, exp2))); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Power(m::Op().Is(base), + m::Multiply(m::Op().Is(exp1), m::Op().Is(exp2))))); } // Don't simplify pow(pow(A, X), Y) => pow(A, X*Y) if X and Y are complex @@ -673,8 +799,7 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) { inner_power, exp2)); m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie()); } @@ -693,8 +818,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, div); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); 
ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -715,8 +839,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, div); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -740,8 +863,7 @@ TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, cplx); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -765,8 +887,7 @@ TEST_F(AlgebraicSimplifierTest, RealOfComplex) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, real); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -790,8 +911,7 @@ TEST_F(AlgebraicSimplifierTest, ImagOfComplex) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, imag); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param1); @@ -818,11 +938,10 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, add); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Add(param1, param2)); + EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(1), m::Parameter(2)))); } // Test that exp(A)/exp(B) is simplified to exp(A-B) @@ -843,15 +962,16 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), - op::Divide(op::Exp(param0), op::Exp(param1))); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Divide(m::Exp(m::Parameter(0)), m::Exp(m::Parameter(1))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), - op::Exp(op::Subtract(param0, param1))); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Exp(m::Subtract(m::Parameter(0), m::Parameter(1))))); } // Test that exp(A)*exp(B) is simplified to exp(A+B) @@ -873,14 +993,14 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) { auto computation = m->AddEntryComputation(builder.Build()); 
EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Exp(param0), op::Exp(param1))); + GmockMatch(m::Multiply(m::Exp(m::Parameter(0)), + m::Exp(m::Parameter(1))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::Exp(op::Add(param0, param1))); + GmockMatch(m::Exp(m::Add(m::Parameter(0), m::Parameter(1))))); } // Test that pow(exp(A), B) is simplified to exp(A*B) @@ -900,14 +1020,14 @@ TEST_F(AlgebraicSimplifierTest, PowExp) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Power(op::Exp(param0), param1)); + GmockMatch(m::Power(m::Exp(m::Parameter(0)), m::Parameter(1)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), - op::Exp(op::Multiply(param0, param1))); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Exp(m::Multiply(m::Parameter(0), m::Parameter(1))))); } // Test that ln(pow(A, B)) is simplified to ln(A)*B @@ -927,14 +1047,14 @@ TEST_F(AlgebraicSimplifierTest, LnPow) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Log(op::Power(param0, param1))); + GmockMatch(m::Log(m::Power(m::Parameter(0), m::Parameter(1))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Log(param0), param1)); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Multiply(m::Log(m::Parameter(0)), m::Parameter(1)))); } // Test that ln(exp(A)) is simplified to A @@ -951,10 +1071,10 @@ TEST_F(AlgebraicSimplifierTest, LnExp) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0))); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Log(m::Exp(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param0); @@ -981,13 +1101,14 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Log(op::Divide(op::Exp(param0), op::Exp(param1)))); + GmockMatch(m::Log(m::Divide(m::Exp(m::Parameter(0)), + m::Exp(m::Parameter(1)))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Subtract(m::Parameter(0), m::Parameter(1)))); } // Test that pow(A, 0) where A is a scalar is simplified to the scalar @@ -1005,14 +1126,14 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Power(param0, 
zero)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Power(m::Parameter(0), m::Op().Is(zero)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); - EXPECT_THAT(root, op::Constant()); + EXPECT_THAT(root, GmockMatch(m::Constant())); EXPECT_EQ(root->literal().GetFirstElement(), 1); } @@ -1030,14 +1151,14 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Power(m::Parameter(0), m::Op().Is(zero)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); - EXPECT_THAT(root, op::Broadcast()); + EXPECT_THAT(root, GmockMatch(m::Broadcast())); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), r1f32)) << ShapeUtil::HumanString(root->shape()); EXPECT_EQ(root->dimensions().size(), 0); @@ -1059,10 +1180,10 @@ TEST_F(AlgebraicSimplifierTest, Pow1) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Power(param0, one)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Power(m::Parameter(0), m::Op().Is(one)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param0); @@ -1082,13 +1203,14 @@ TEST_F(AlgebraicSimplifierTest, Pow2) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Power(param0, two)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Power(m::Parameter(0), m::Op().Is(two)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(0)))); } // Test that pow(A, -1) is simplified to 1/A. 
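The surrounding exponential and power hunks (ExpDiv, ExpMul, PowExp, LnPow, LnExp, LnExpDiv, Pow0, Pow1, Pow2, PowNegative1) exercise the standard identities below; again, the formulas merely restate the comments already in the tests:

$$\frac{e^{A}}{e^{B}}=e^{A-B},\quad e^{A}e^{B}=e^{A+B},\quad \bigl(e^{A}\bigr)^{B}=e^{AB},\quad \ln\!\bigl(A^{B}\bigr)=B\ln A,\quad \ln\!\bigl(e^{A}\bigr)=A,$$
$$A^{0}=1,\quad A^{1}=A,\quad A^{2}=A\cdot A,\quad A^{-1}=\frac{1}{A}.$$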
@@ -1105,14 +1227,14 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Power(m::Parameter(0), m::Op().Is(negative_one)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); - EXPECT_THAT(root, op::Divide(op::Broadcast(), param0)); + EXPECT_THAT(root, GmockMatch(m::Divide(m::Broadcast(), m::Parameter(0)))); EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kBroadcast); EXPECT_EQ(root->operand(0)->operand(0)->literal().GetFirstElement(), 1); @@ -1153,13 +1275,12 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) { ShapeUtil::MakeShape(F32, {3, 3, 3}), lhs, rhs, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); m->AddEntryComputation(builder.Build()); - HloPassFix simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + HloPassFix simplifier(default_options_); EXPECT_THAT(m->entry_computation()->root_instruction(), - op::Convolution(lhs, rhs)); + GmockMatch(m::Convolution(m::Op().Is(lhs), m::Op().Is(rhs)))); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(m->entry_computation()->root_instruction(), - op::Broadcast(op::Constant())); + GmockMatch(m::Broadcast(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) { @@ -1196,13 +1317,12 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))), window, add_computation)); m->AddEntryComputation(builder.Build()); - HloPassFix simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + HloPassFix simplifier(default_options_); EXPECT_THAT(m->entry_computation()->root_instruction(), - op::ReduceWindow(param, op::Constant())); + GmockMatch(m::ReduceWindow(m::Parameter(0), m::Constant()))); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(m->entry_computation()->root_instruction(), - op::Broadcast(op::Constant())); + GmockMatch(m::Broadcast(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) { @@ -1225,12 +1345,11 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) { padding)); m->AddEntryComputation(builder.Build()); EXPECT_THAT(m->entry_computation()->root_instruction(), - op::Pad(param, op::Constant())); - HloPassFix simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + GmockMatch(m::Pad(m::Parameter(0), m::Constant()))); + HloPassFix simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(m->entry_computation()->root_instruction(), - op::Broadcast(op::Constant())); + GmockMatch(m::Broadcast(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) { @@ -1251,10 +1370,9 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) { m->AddEntryComputation(std::move(computation)); EXPECT_THAT(m->entry_computation()->root_instruction(), - op::Reshape(op::Broadcast(op::Reshape(op)))); + GmockMatch(m::Reshape(m::Broadcast(m::Reshape(m::Op().Is(op)))))); - HloPassFix simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + HloPassFix simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(m->entry_computation()->root_instruction(), op); @@ -1271,10 +1389,10 @@ 
TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Convert(input)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Convert(m::Op().Is(input)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), input); @@ -1292,10 +1410,10 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Copy(m::Parameter(0)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param0); @@ -1314,19 +1432,24 @@ TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) { *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 2, 0, 3}); auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Copy(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Copy(m::Parameter(0)))); - AlgebraicSimplifier simplifier1(/*is_layout_sensitive=*/true, - non_bitcasting_callback()); + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier1(options); ASSERT_FALSE(simplifier1.Run(m.get()).ValueOrDie()); // Verify that the copy is not replaced. - EXPECT_THAT(computation->root_instruction(), op::Copy(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Copy(m::Parameter(0)))); - AlgebraicSimplifier simplifier2(/*is_layout_sensitive=*/true, - bitcasting_callback()); + AlgebraicSimplifierOptions options2(bitcasting_callback()); + options2.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier2(options2); ASSERT_TRUE(simplifier2.Run(m.get()).ValueOrDie()); // Verify that the copy is replaced. - EXPECT_THAT(computation->root_instruction(), op::Bitcast(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Bitcast(m::Parameter(0)))); } // Test that unary concatenates are removed. 
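CopyEqualsBitcast above is the template for every test in this file that needs non-default behaviour: instead of the removed two-argument constructor, an `AlgebraicSimplifierOptions` object is built from the callback and individual knobs are toggled before constructing the pass. A condensed sketch of that pattern, using only calls that appear in this diff (the variable names in the sketch are mine):

```
// Layout-insensitive, no bitcast rewriting: the common case, shared through
// the fixture member default_options_.
AlgebraicSimplifier simplifier(default_options_);

// Layout-sensitive variant with bitcast rewriting enabled, as in
// CopyEqualsBitcast and TransposeEqualsBitcast1/2.
AlgebraicSimplifierOptions options(bitcasting_callback());
options.set_is_layout_sensitive(true);
AlgebraicSimplifier bitcasting_simplifier(options);
ASSERT_TRUE(bitcasting_simplifier.Run(m.get()).ValueOrDie());
```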
@@ -1341,10 +1464,10 @@ TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Concatenate(m::Parameter(0)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param0); @@ -1371,16 +1494,17 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT( - computation->root_instruction(), - op::Concatenate(empty_literal, param0, param0, empty_slice, param1)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Concatenate( + m::Op().Is(empty_literal), m::Parameter(0), m::Parameter(0), + m::Op().Is(empty_slice), m::Parameter(1)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::Concatenate(param0, param0, param1)); + GmockMatch(m::Concatenate(m::Parameter(0), m::Parameter(0), + m::Parameter(1)))); } // Test that reduce of concat is simplified. @@ -1423,14 +1547,14 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) { auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT( computation->root_instruction(), - op::Map(op::Map(op::Reduce(param0, zero), op::Reduce(param1, zero)), - op::Reduce(param2, zero))); + GmockMatch(m::Map(m::Map(m::Reduce(m::Parameter(0), m::Op().Is(zero)), + m::Reduce(m::Parameter(1), m::Op().Is(zero))), + m::Reduce(m::Parameter(2), m::Op().Is(zero))))); } // Test a concatenate with only empty operands is removed. 
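The expectation in SimplifyReduceOfConcat above encodes how a reduce over the concatenation dimension decomposes: each concatenated operand is reduced separately and the partial results are combined pairwise, via map, with the reduction's own binary function (addition here). Informally, with $f$ denoting that combiner (notation mine, summarizing the matcher in the test):

$$\mathrm{reduce}\bigl(\mathrm{concat}(A,B,C)\bigr)\;=\;f\bigl(f(\mathrm{reduce}(A),\,\mathrm{reduce}(B)),\,\mathrm{reduce}(C)\bigr).$$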
@@ -1453,10 +1577,10 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Concatenate(empty_literal, empty_slice)); + GmockMatch(m::Concatenate(m::Op().Is(empty_literal), + m::Op().Is(empty_slice)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), empty_literal); @@ -1479,10 +1603,80 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) { auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Pad(param0, param1)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Pad(m::Parameter(0), m::Parameter(1)))); +} + +TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) { + auto m = CreateNewVerifiedModule(); + Shape r2f32 = ShapeUtil::MakeShape(F32, {100, 99}); + Shape concat_shape = ShapeUtil::MakeShape(F32, {50, 80}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r2f32, "param0")); + HloInstruction* param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, r2f32, "param1")); + + HloInstruction* slice0 = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{0, 0}, + /*limit_indices=*/{50, 10}, /*strides=*/{1, 1})); + + // Cannot merge 'slice0' and 'slice1' because of different start indices in + // dimension 0. + HloInstruction* slice1 = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 10}, + /*limit_indices=*/{100, 20}, /*strides=*/{1, 1})); + + // Cannot merge 'slice1' and 'slice2' because of stride in dimension 2. + HloInstruction* slice2 = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 20}, + /*limit_indices=*/{100, 40}, /*strides=*/{1, 2})); + + // Cannot merge 'slice2' and 'slice3' because of stride in dimension 2. + HloInstruction* slice3 = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 40}, + /*limit_indices=*/{100, 50}, /*strides=*/{1, 1})); + + // Can merge 'slice3' and 'slice4'. + HloInstruction* slice4 = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 50}, + /*limit_indices=*/{100, 60}, /*strides=*/{1, 1})); + + // Can merge 'slice4' and 'slice5'. + HloInstruction* slice5 = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 60}, + /*limit_indices=*/{100, 70}, /*strides=*/{1, 1})); + + // Cannot merge 'slice5' and 'slice6' because of overlap. + HloInstruction* slice6 = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 69}, + /*limit_indices=*/{100, 79}, /*strides=*/{1, 1})); + + // Cannot merge 'slice6' and 'slice7' because of slicing from a different + // parameter. 
+ HloInstruction* slice7 = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {50, 10}), param1, /*start_indices=*/{50, 79}, + /*limit_indices=*/{100, 89}, /*strides=*/{1, 1})); + + builder.AddInstruction(HloInstruction::CreateConcatenate( + concat_shape, + {slice0, slice1, slice2, slice3, slice4, slice5, slice6, slice7}, 1)); + auto computation = m->AddEntryComputation(builder.Build()); + + AlgebraicSimplifier simplifier(default_options_); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + auto s = m::Slice(m::Parameter(0)); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Concatenate(s, s, s, s, s, m::Slice(m::Parameter(1))))); + // The operand 3 should be a merge of 'slice3', 'slice4' and 'slice5', so its + // shape should have dimensions {50, 30}. + EXPECT_TRUE( + ShapeUtil::Equal(computation->root_instruction()->operand(3)->shape(), + ShapeUtil::MakeShape(F32, {50, 30}))); + EXPECT_EQ(computation->root_instruction()->operand(3)->slice_starts(1), 40); } // Test that a simplification which changes layouts is not performed if layout @@ -1502,14 +1696,17 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) { *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1}); *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 0}); - EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Copy(m::Parameter(0)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - non_bitcasting_callback()); + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); // Copy has not been removed. - EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Copy(m::Parameter(0)))); } // Test that a simplification which preserves layouts is performed if layout @@ -1529,10 +1726,12 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) { *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1}); *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1}); - EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Copy(m::Parameter(0)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - non_bitcasting_callback()); + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Copy has been removed. @@ -1557,14 +1756,17 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Parameter(0)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - non_bitcasting_callback()); + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); // Reshape is not replaced with a bitcast. 
- EXPECT_THAT(computation->root_instruction(), op::Reshape(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Parameter(0)))); } // Test transforming reshapes and transposes of rng. @@ -1588,13 +1790,13 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) { auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifier simplifier( + (AlgebraicSimplifierOptions(bitcasting_callback()))); EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - // Verify that that reshape(transpose(rng)) is replace by a single rng of the + // Verify that reshape(transpose(rng)) is replace by a single rng of the // same shape as the reshape. - EXPECT_THAT(computation->root_instruction(), op::Rng()); + EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Rng())); EXPECT_TRUE(ShapeUtil::Equal(computation->root_instruction()->shape(), reshape_shape)); } @@ -1636,17 +1838,20 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Tuple(transformable_reshape, dimensions_wrong_reshape, - layout_wrong_reshape)); + GmockMatch(m::Tuple(m::Op().Is(transformable_reshape), + m::Op().Is(dimensions_wrong_reshape), + m::Op().Is(layout_wrong_reshape)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); simplifier.Run(m.get()).ValueOrDie(); // Verify that only the first reshape is replaced. EXPECT_THAT( computation->root_instruction(), - op::Tuple(op::Bitcast(), dimensions_wrong_reshape, layout_wrong_reshape)); + GmockMatch(m::Tuple(m::Bitcast(), m::Op().Is(dimensions_wrong_reshape), + m::Op().Is(layout_wrong_reshape)))); } // Regression test for a bug where if we failed to sink a reshape, we'd set the @@ -1667,8 +1872,8 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) { builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {4}), add)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifier simplifier( + (AlgebraicSimplifierOptions(bitcasting_callback()))); m->AddEntryComputation(builder.Build()); EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie()); } @@ -1692,8 +1897,8 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) { HloInstruction::CreateBroadcast(ShapeUtil::MakeShape(F32, {2, 2, 2}), add, /*broadcast_dimensions=*/{0, 1})); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifier simplifier( + (AlgebraicSimplifierOptions(bitcasting_callback()))); m->AddEntryComputation(builder.Build()); EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie()); } @@ -1715,14 +1920,17 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Transpose(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Transpose(m::Parameter(0)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); 
ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Verify that the reshape is replaced. - EXPECT_THAT(computation->root_instruction(), op::Bitcast(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Bitcast(m::Parameter(0)))); } TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) { @@ -1742,14 +1950,17 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Transpose(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Transpose(m::Parameter(0)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Verify that the reshape is replaced. - EXPECT_THAT(computation->root_instruction(), op::Bitcast(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Bitcast(m::Parameter(0)))); } TEST_F(AlgebraicSimplifierTest, ReshapesMerged) { @@ -1769,13 +1980,13 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Reshape(op::Reshape(param0))); + GmockMatch(m::Reshape(m::Reshape(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Parameter(0)))); } TEST_F(AlgebraicSimplifierTest, CopiesMerged) { @@ -1796,13 +2007,16 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Copy(op::Copy(param0))); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Copy(m::Copy(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - non_bitcasting_callback()); + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Copy(m::Parameter(0)))); } TEST_F(AlgebraicSimplifierTest, TransposesMerged) { @@ -1821,13 +2035,14 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Transpose(m::Op().Is(transpose1)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Transpose(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Transpose(m::Parameter(0)))); EXPECT_EQ(std::vector({2, 1, 0}), computation->root_instruction()->dimensions()); } @@ -1846,13 +2061,13 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) { auto computation = m->AddEntryComputation(builder.Build()); 
EXPECT_THAT(computation->root_instruction(), - op::Broadcast(op::Reshape(param0))); + GmockMatch(m::Broadcast(m::Reshape(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Broadcast(m::Parameter(0)))); } // Test merging broadcast and reshape. @@ -1869,13 +2084,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Reshape(op::Broadcast(param0))); + GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Broadcast(m::Parameter(0)))); } TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) { @@ -1891,14 +2106,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Reshape(op::Broadcast(param))); + GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::Reshape(op::Broadcast(param))); + GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0))))); } TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) { @@ -1914,13 +2128,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) { HloComputation* computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Reshape(op::Broadcast(param))); + GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Broadcast(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Broadcast(m::Parameter(0)))); EXPECT_THAT(computation->root_instruction()->dimensions(), ::testing::ElementsAre(3)); } @@ -1938,13 +2152,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) { HloComputation* computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Reshape(op::Broadcast(param))); + GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Broadcast(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Broadcast(m::Parameter(0)))); const std::vector broadcast_dims = computation->root_instruction()->dimensions(); EXPECT_EQ(1, broadcast_dims.size()); @@ -1964,14 +2178,13 @@ TEST_F(AlgebraicSimplifierTest, 
BroadcastAndReshape_4_3x2x4x2_6x8) { HloComputation* computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), - op::Reshape(op::Broadcast(param))); + GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::Reshape(op::Broadcast(param))); + GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0))))); } TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) { @@ -1984,13 +2197,13 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Iota()))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Iota()); + EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota())); EXPECT_TRUE( ShapeUtil::Equal(computation->root_instruction()->shape(), result_shape)); } @@ -2004,14 +2217,13 @@ TEST_F(AlgebraicSimplifierTest, IotaEffectiveScalar) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Iota()); + EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota())); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); auto root = computation->root_instruction(); - EXPECT_THAT(root, op::Broadcast(op::Constant())); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant()))); EXPECT_EQ(0.0f, root->operand(0)->literal().GetFirstElement()); EXPECT_TRUE( ShapeUtil::Equal(computation->root_instruction()->shape(), result_shape)); @@ -2027,13 +2239,14 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2_6) { auto computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Iota()))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Iota()))); } TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) { @@ -2046,13 +2259,13 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) { HloComputation* computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Iota()))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Iota()); + EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota())); 
EXPECT_EQ(Cast(computation->root_instruction()) ->iota_dimension(), 3); @@ -2068,13 +2281,13 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) { HloComputation* computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Iota()))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Iota()); + EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota())); const int64 iota_dim = Cast(computation->root_instruction()) ->iota_dimension(); @@ -2091,13 +2304,14 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4x2_6x8) { HloComputation* computation = m->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Iota()))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Iota()))); } TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) { @@ -2120,10 +2334,10 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) { auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); @@ -2153,8 +2367,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) { auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); auto has_negative_padding = [](const HloInstruction* pad) { for (auto& padding_dimension : pad->padding_config().dimensions()) { @@ -2166,16 +2379,54 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) { return false; }; - EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero)))); EXPECT_TRUE(has_negative_padding(pad)); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero))); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Slice(m::Pad(m::Parameter(0), m::Op().Is(zero))))); EXPECT_FALSE( has_negative_padding(computation->root_instruction()->operand(0))); } +TEST_F(AlgebraicSimplifierTest, TrivialInteriorPadding) { + // Verify that a pad instruction with interior padding on one-sized + // dimensions, removes the interior padding. 
+ HloComputation::Builder builder(TestName()); + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {2, 1}), "param")); + HloInstruction* zero = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); + PaddingConfig padding; + for (int i = 0; i < 2; ++i) { + auto dimension = padding.add_dimensions(); + dimension->set_edge_padding_low(3); + dimension->set_edge_padding_high(3); + dimension->set_interior_padding(i * 3); + } + HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad( + ShapeUtil::MakeShape(F32, {8, 7}), param, zero, padding)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + AlgebraicSimplifier simplifier(default_options_); + + ASSERT_THAT(computation->root_instruction(), + GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero)))); + ASSERT_TRUE(HasInteriorPadding(pad->padding_config())); + + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero)))); + EXPECT_FALSE( + HasInteriorPadding(computation->root_instruction()->padding_config())); +} + TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) { HloComputation::Builder builder(TestName()); HloInstruction* param = @@ -2187,10 +2438,10 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) { auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Parameter(0)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); @@ -2210,10 +2461,10 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) { auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Slice(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Slice(m::Parameter(0)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); @@ -2239,13 +2490,14 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) { auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Slice(op::Slice(param))); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Slice(m::Slice(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Slice(param)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Slice(m::Parameter(0)))); EXPECT_EQ(computation->root_instruction()->slice_starts(0), 3); EXPECT_EQ(computation->root_instruction()->slice_starts(1), 5); EXPECT_EQ(computation->root_instruction()->slice_limits(0), dim0 - 2); @@ -2271,13 +2523,14 @@ 
TEST_F(AlgebraicSimplifierTest, SliceOfReshapeToReshapeOfSlice) { auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param))); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Slice(m::Reshape(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Slice(param))); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Reshape(m::Slice(m::Parameter(0))))); } TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) { @@ -2296,10 +2549,10 @@ TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) { auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param))); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Slice(m::Reshape(m::Parameter(0))))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); } @@ -2312,12 +2565,84 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) { builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys)); auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), keys); } +TEST_F(AlgebraicSimplifierTest, ReplacePermutationSortWithScatter) { + const char* hlo_string = R"( + HloModule permutation_sort + + ENTRY sort_computation { + keys = f32[64,8732]{1,0} parameter(0) + values = s32[64,8732]{1,0} iota(), iota_dimension=1 + sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1} + gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1 + ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={1} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_enable_permutation_sort_replacement(true); + AlgebraicSimplifier simplifier(options); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + GmockMatch(m::Tuple( + m::Iota(), + m::Scatter(m::Iota(), m::Concatenate(m::Iota(), m::Reshape()), + m::Reshape())))); +} + +TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortIfNonIntegral) { + // Same as ReplacePermutationSortWithScatter except that the iota has F32 + // type. 
+ const char* hlo_string = R"( + HloModule permutation_sort + + ENTRY sort_computation { + keys = f32[64,8732]{1,0} parameter(0) + values = f32[64,8732]{1,0} iota(), iota_dimension=1 + sort = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(keys, values), dimensions={1} + gte = f32[64,8732]{1,0} get-tuple-element(sort), index=1 + ROOT sort2 = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(gte, values), dimensions={1} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_enable_permutation_sort_replacement(true); + AlgebraicSimplifier simplifier(options); + EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); +} + +TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortWrongDimensions) { + // Same as ReplacePermutationSortWithScatter except that the sort dimensions + // don't match. + const char* hlo_string = R"( + HloModule permutation_sort + + ENTRY sort_computation { + keys = f32[64,8732]{1,0} parameter(0) + values = s32[64,8732]{1,0} iota(), iota_dimension=1 + sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1} + gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1 + ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={0} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_enable_permutation_sort_replacement(true); + AlgebraicSimplifier simplifier(options); + EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); +} + TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) { auto builder = HloComputation::Builder(TestName()); @@ -2334,11 +2659,11 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) { keys, {values0, values1})); auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::Tuple(keys, values0, values1)); + GmockMatch(m::Tuple(m::Op().Is(keys), m::Op().Is(values0), + m::Op().Is(values1)))); } // Test that A && True is simplified to A @@ -2356,8 +2681,7 @@ TEST_F(AlgebraicSimplifierTest, AndTrue) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAnd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -2378,8 +2702,7 @@ TEST_F(AlgebraicSimplifierTest, AndTrue2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAnd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -2400,8 +2723,7 @@ TEST_F(AlgebraicSimplifierTest, AndFalse) { auto computation = m->AddEntryComputation(builder.Build()); 
HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAnd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, const_false); @@ -2422,8 +2744,7 @@ TEST_F(AlgebraicSimplifierTest, AndFalse2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAnd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, const_false); @@ -2444,8 +2765,7 @@ TEST_F(AlgebraicSimplifierTest, OrTrue) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kOr); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, const_true); @@ -2466,8 +2786,7 @@ TEST_F(AlgebraicSimplifierTest, OrTrue2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kOr); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, const_true); @@ -2488,8 +2807,7 @@ TEST_F(AlgebraicSimplifierTest, OrFalse) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kOr); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -2510,8 +2828,7 @@ TEST_F(AlgebraicSimplifierTest, OrFalse2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kOr); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -2641,15 +2958,15 @@ TEST_P(ConvInputPaddingTest, DoTest) { auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); if (testcase.expected_conv_window.empty()) { ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); } else { ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto* conv = module->entry_computation()->root_instruction(); SCOPED_TRACE(module->ToString()); - ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter())); + ASSERT_THAT(conv, + GmockMatch(m::Convolution(m::Parameter(), m::Parameter()))); 
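A quick reference for the conversion applied throughout these hunks: assertions move from the op:: HloMatchers to the pattern-matcher m:: combinators wrapped in GmockMatch, and the two-argument AlgebraicSimplifier constructor gives way to an options object. The block below only collects forms that already appear in this diff:

```
// Old style:
//   EXPECT_THAT(root, op::Broadcast(op::Constant()));
//   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
//                                  non_bitcasting_callback());
//
// New style:
//   EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant())));
//   // Binding to a specific instruction uses m::Op().Is(instr):
//   EXPECT_THAT(root, GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
//   AlgebraicSimplifier simplifier(default_options_);
//   // Non-default configurations build AlgebraicSimplifierOptions explicitly:
//   AlgebraicSimplifierOptions options(bitcasting_callback());
//   options.set_is_layout_sensitive(true);
//   AlgebraicSimplifier layout_sensitive_simplifier(options);
```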
EXPECT_EQ(window_util::ToString(conv->window()), absl::StrCat("size=3x3 ", testcase.expected_conv_window)); } @@ -2759,15 +3076,15 @@ TEST_P(ConvFilterPaddingTest, DoIt) { auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); if (testcase.expected_conv_window.empty()) { ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); } else { ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto* conv = module->entry_computation()->root_instruction(); SCOPED_TRACE(module->ToString()); - ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter())); + ASSERT_THAT(conv, + GmockMatch(m::Convolution(m::Parameter(), m::Parameter()))); EXPECT_EQ(window_util::ToString(conv->window()), absl::StrFormat("size=%dx%d %s", conv->operand(1)->shape().dimensions(2), @@ -2908,8 +3225,9 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { auto module = CreateNewUnverifiedModule(); auto* computation = module->AddEntryComputation(b.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - bitcasting_callback()); + AlgebraicSimplifierOptions simplifier_options(bitcasting_callback()); + simplifier_options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(simplifier_options); if (!simplifier.Run(module.get()).ValueOrDie()) { return "NO_CHANGE"; } @@ -3032,17 +3350,15 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { EXPECT_EQ(root, slice); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); // Running simplification again should not result in any further changes. ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); - - root = computation->root_instruction(); - EXPECT_THAT(root, op::Broadcast(scalar_param)); - EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Broadcast(m::Op().Is(scalar_param)) + .WithShapeEqualTo(&slice_shape))); } // Test that reshape(transpose(broadcast(/*scalar value*/))) simplifies to a @@ -3071,13 +3387,11 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { EXPECT_EQ(root, reshape); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - - root = computation->root_instruction(); - EXPECT_THAT(root, op::Broadcast(forty_two)); - EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Broadcast(m::Op().Is(forty_two)) + .WithShapeEqualTo(&reshape_shape))); } // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x). 
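The comment above summarizes the next two tests: a kPad feeding a kReduceWindow can be absorbed by widening the window's own padding, so the pad disappears and the reduce-window reads the original operand directly (the expectations below check ReduceWindow(operand, constant)). A rough sketch of the padding bookkeeping, under the assumption that only edge padding is merged and that the pad value is compatible with the reduce-window's init value (interior padding and strides are ignored here, and the field names are made up for this sketch rather than mirroring xla::Window or xla::PaddingConfig):

```
#include <cstdint>

struct DimPad { int64_t low; int64_t high; };

// Illustrative only: fold a pad's per-dimension edge padding into the
// reduce-window's per-dimension window padding.
DimPad MergePadIntoWindow(const DimPad& pad_edge, const DimPad& window_pad) {
  return {window_pad.low + pad_edge.low, window_pad.high + pad_edge.high};
}
```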
@@ -3138,8 +3452,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, reduce_window); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); // Running simplification again should not result in any further changes. @@ -3147,7 +3460,8 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { // Verify the result root = computation->root_instruction(); - EXPECT_THAT(root, op::ReduceWindow(operand, op::Constant())); + EXPECT_THAT(root, + GmockMatch(m::ReduceWindow(m::Op().Is(operand), m::Constant()))); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape)) << ShapeUtil::HumanString(root->shape()) << " vs " << ShapeUtil::HumanString(reduce_window_shape); @@ -3224,8 +3538,7 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) { auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, reduce_window); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); // Running simplification again should not result in any further changes. @@ -3233,7 +3546,8 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) { // Verify the result root = computation->root_instruction(); - EXPECT_THAT(root, op::ReduceWindow(op::Convert(parameter), op::Constant())); + EXPECT_THAT(root, GmockMatch(m::ReduceWindow(m::Convert(m::Parameter(0)), + m::Constant()))); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape)) << ShapeUtil::HumanString(root->shape()) << " vs " << ShapeUtil::HumanString(reduce_window_shape); @@ -3258,8 +3572,7 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) { auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); @@ -3295,8 +3608,7 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) { m->AddEmbeddedComputation(std::move(dot_computation)); m->AddEntryComputation(call_builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); } @@ -3313,11 +3625,10 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) { auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::Tuple(op::Constant(), op::Constant())); + GmockMatch(m::Tuple(m::Constant(), m::Constant()))); } // A dynamic-slice is trivial if its start indices are all zeroes and the size @@ -3337,10 +3648,9 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) { /*slice_sizes=*/{10, 100, 1000})); auto computation = 
m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); - EXPECT_THAT(computation->root_instruction(), op::Parameter()); + EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Parameter())); } // A dynamic-update-slice is trivial if its start indices are all zeroes and the @@ -3371,11 +3681,10 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) { 3, ShapeUtil::MakeShape(U32, {3}), "update_indices")))); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::DynamicSlice(op::Parameter(), op::Parameter())); + GmockMatch(m::DynamicSlice(m::Parameter(), m::Parameter()))); } // Test that two consecutive broadcasts can be merged to one. @@ -3394,11 +3703,10 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Broadcast(op::Constant())); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant()))); EXPECT_THAT(root->dimensions(), ElementsAre(2)); } @@ -3421,11 +3729,10 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Broadcast(op::Parameter(0))); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Parameter(0)))); EXPECT_THAT(root->dimensions(), ElementsAre(1, 3)); } @@ -3442,11 +3749,10 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Iota()); + EXPECT_THAT(root, GmockMatch(m::Iota())); EXPECT_EQ(Cast(root)->iota_dimension(), 2); } @@ -3464,11 +3770,10 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Iota()); + EXPECT_THAT(root, GmockMatch(m::Iota())); 
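Before the final iota_dimension check below: when broadcast(iota) collapses into a single iota, the iota dimension is remapped through the broadcast's dimensions map, since a broadcast sends operand dimension d to output dimension dimensions[d]. A small standalone sketch (the dimensions vector here is hypothetical, picked to land on the value 2 that the test asserts):

```
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // A broadcast maps operand dimension d to output dimension dimensions[d],
  // so the merged iota varies along dimensions[old_iota_dimension].
  std::vector<int64_t> broadcast_dimensions = {1, 2};  // hypothetical
  int64_t old_iota_dimension = 1;
  int64_t new_iota_dimension = broadcast_dimensions[old_iota_dimension];
  std::cout << new_iota_dimension << "\n";  // 2
}
```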
EXPECT_EQ(Cast(root)->iota_dimension(), 2); } @@ -3486,11 +3791,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Reshape(op::Constant())); + EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) { @@ -3507,11 +3812,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Reshape(op::Constant())); + EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) { @@ -3528,8 +3833,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); } @@ -3547,11 +3852,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Parameter()); + EXPECT_THAT(root, GmockMatch(m::Parameter())); } TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) { @@ -3569,11 +3874,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Parameter(1)); + EXPECT_THAT(root, GmockMatch(m::Parameter(1))); } TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) { @@ -3591,11 +3896,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Slice(op::Parameter(2))); + EXPECT_THAT(root, 
GmockMatch(m::Slice(m::Parameter(2)))); EXPECT_EQ(root->slice_starts(0), 1); EXPECT_EQ(root->slice_limits(0), 2); } @@ -3613,11 +3918,11 @@ TEST_F(AlgebraicSimplifierTest, NegateNegate) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Parameter(0)); + EXPECT_THAT(root, GmockMatch(m::Parameter(0))); } TEST_F(AlgebraicSimplifierTest, NotNot) { @@ -3633,11 +3938,11 @@ TEST_F(AlgebraicSimplifierTest, NotNot) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Parameter(0)); + EXPECT_THAT(root, GmockMatch(m::Parameter(0))); } struct PadReduceWindowEffectiveBroadcastCase { @@ -3733,8 +4038,7 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) { output_shape, pad, zero, window, add_computation)); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); @@ -3742,10 +4046,10 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) { ShapeUtil::Equal(computation->root_instruction()->shape(), output_shape)); if (param.should_become_broadcast) { - EXPECT_THAT(computation->root_instruction(), op::Broadcast(::testing::_)); + EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Broadcast())); } else { EXPECT_THAT(computation->root_instruction(), - op::ReduceWindow(::testing::_, zero)); + GmockMatch(m::ReduceWindow(m::Op(), m::Op().Is(zero)))); } } @@ -3815,8 +4119,7 @@ TEST_P(DotStrengthReductionTest, DotStrengthReduction) { builder.AddInstruction(HloInstruction::CreateDot( dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); auto computation = module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get())); const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1; const bool computation_should_be_modified = @@ -3845,7 +4148,7 @@ struct DotOfConcatTestSpec { }; class DotOfConcatSimplificationTest - : public HloTestBase, + : public AlgebraicSimplifierTest, public ::testing::WithParamInterface {}; // Test that we transform @@ -3893,19 +4196,19 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) { dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( 
ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape)); - auto match_dot_0 = op::Dot(op::Slice(op::Constant()), op::Parameter(0)); - auto match_dot_1 = op::Dot(op::Slice(op::Constant()), op::Parameter(1)); - auto match_dot_2 = op::Dot(op::Slice(op::Constant()), op::Parameter(2)); - EXPECT_THAT(computation->root_instruction(), - op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2)); + auto match_dot_0 = m::Dot(m::Slice(m::Constant()), m::Parameter(0)); + auto match_dot_1 = m::Dot(m::Slice(m::Constant()), m::Parameter(1)); + auto match_dot_2 = m::Dot(m::Slice(m::Constant()), m::Parameter(2)); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Add(m::Add(match_dot_0, match_dot_1), match_dot_2))); } // Test that we transform @@ -3958,20 +4261,20 @@ TEST_P(DotOfConcatSimplificationTest, ConstantRHS) { dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape)); - auto match_dot_0 = op::Dot(op::Parameter(0), op::Slice(op::Constant())); - auto match_dot_1 = op::Dot(op::Parameter(1), op::Slice(op::Constant())); - auto match_dot_2 = op::Dot(op::Parameter(2), op::Slice(op::Constant())); - auto match_dot_3 = op::Dot(op::Parameter(3), op::Slice(op::Constant())); - EXPECT_THAT(computation->root_instruction(), - op::Add(op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2), - match_dot_3)); + auto match_dot_0 = m::Dot(m::Parameter(0), m::Slice(m::Constant())); + auto match_dot_1 = m::Dot(m::Parameter(1), m::Slice(m::Constant())); + auto match_dot_2 = m::Dot(m::Parameter(2), m::Slice(m::Constant())); + auto match_dot_3 = m::Dot(m::Parameter(3), m::Slice(m::Constant())); + EXPECT_THAT( + computation->root_instruction(), + GmockMatch(m::Add(m::Add(m::Add(match_dot_0, match_dot_1), match_dot_2), + match_dot_3))); } DotOfConcatTestSpec kDotOfConcatTestSpecs[] = { @@ -4000,8 +4303,7 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) { const HloComputation* const computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), operand); } @@ -4021,7 +4323,7 @@ struct DotOfGatherTestSpec { }; class DotOfGatherSimplificationTest - : public HloTestBase, + : public AlgebraicSimplifierTest, public ::testing::WithParamInterface {}; // input: dot(DS(ctA), ctB)) @@ -4078,8 +4380,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) { dot_shape, ds, rhs, dot_dnums, DefaultPrecisionConfig(2))); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( @@ -4090,8 +4391,8 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) { HloOpcode::kDynamicSlice); } else { EXPECT_THAT(computation->root_instruction(), - op::DynamicSlice(op::Dot(op::Constant(), op::Constant()), - op::Concatenate())); + 
GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()), + m::Concatenate()))); } } @@ -4149,8 +4450,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) { dot_shape, lhs, ds, dot_dnums, DefaultPrecisionConfig(2))); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( @@ -4161,8 +4461,8 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) { HloOpcode::kDynamicSlice); } else { EXPECT_THAT(computation->root_instruction(), - op::DynamicSlice(op::Dot(op::Constant(), op::Constant()), - op::Concatenate())); + GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()), + m::Concatenate()))); } } diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc new file mode 100644 index 00000000000..c11452a6fbd --- /dev/null +++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc @@ -0,0 +1,286 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/ar_crs_combiner.h" + +#include +#include +#include + +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +namespace { + +namespace m = match; + +// If the argument instruction is a CRS in the sequence +// AR -> Convert -> Add -> CRS +// then return the AR in the sequence. +// TODO(b/117554291): Rewrite this to recognize more general patterns, +// not just the specific one of AR -> Add -> Convert -> CRS. 
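To make the pattern and the later rewrite easier to follow: per the Match call below, the convert sits between the all-reduce and the add, i.e. the matched dataflow is AR -> Convert -> Add -> CRS, and RewriteGraph later replaces it with a combined all-reduce minus other_summand * (num_spatial_partitions_ - 1). A small standalone check of that identity, assuming the other summand is the same value on every partition (which is what KeepProvablyEqualInstructionGroups verifies); the replica dimension of the CRS and the bf16/f32 conversion are ignored here, and the numbers are made up:

```
#include <iostream>
#include <vector>

int main() {
  // Hypothetical per-partition values y_p feeding the early all-reduce, and a
  // summand x that is identical across the P spatial partitions.
  std::vector<double> y = {1.5, 2.5};
  double x = 10.0;
  int P = static_cast<int>(y.size());

  // Before the rewrite each partition computes x + sum_p(y_p).
  double sum_y = 0;
  for (double v : y) sum_y += v;
  double before = x + sum_y;

  // After the rewrite the combined all-reduce sums (x + y_p) over all
  // partitions, i.e. P*x + sum_p(y_p); subtracting x*(P - 1) restores it.
  double combined = 0;
  for (double v : y) combined += x + v;
  double after = combined - x * (P - 1);

  std::cout << before << " == " << after << "\n";  // 14 == 14
}
```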
+absl::optional MatchesArCrsPattern( + HloInstruction* instruction) { + HloInstruction *ar, *convert, *add, *crs; + if (Match(instruction, + m::CrossReplicaSum( + &crs, m::Add(&add, m::Op(), + m::Convert(&convert, + m::CrossReplicaSum(&ar, m::Op()))))) && + ar->users().size() == 1 && ar->shape().element_type() == BF16 && + convert->shape().element_type() == F32 && !crs->all_reduce_id()) { + return ar; + } + return absl::optional(); +} + +} // namespace + +absl::optional ArCrsCombiner::WhileFromBodyParameter( + HloInstruction* instruction) { + CHECK(HloOpcode::kParameter == instruction->opcode()); + HloComputation* computation = instruction->parent(); + auto caller_instructions = call_graph_->GetComputationCallers(computation); + if (caller_instructions.size() == 1) { + auto caller_instruction = caller_instructions[0]; + if (caller_instruction->opcode() == HloOpcode::kWhile) { + return caller_instruction; + } + } + return absl::optional(); +} + +std::vector ArCrsCombiner::GetAllTuples( + HloInstruction* instruction) { + if (instruction->opcode() == HloOpcode::kTuple) { + return {instruction}; + } + if (instruction->opcode() == HloOpcode::kDomain) { + return GetAllTuples(instruction->operands()[0]); + } + if (instruction->opcode() == HloOpcode::kParameter) { + auto maybe_while = WhileFromBodyParameter(instruction); + if (!maybe_while) { + return {}; + } + auto while_instr = *maybe_while; + auto init_tuples = GetAllTuples(while_instr->while_init()); + auto body_tuples = + GetAllTuples(while_instr->while_body()->root_instruction()); + if (init_tuples.empty() || body_tuples.empty()) { + return {}; + } + init_tuples.insert(init_tuples.end(), body_tuples.begin(), + body_tuples.end()); + return init_tuples; + } + if (instruction->opcode() == HloOpcode::kGetTupleElement) { + std::vector result_tuples; + for (auto tuple : GetAllTuples(instruction->operands()[0])) { + auto tmp_tuples = + GetAllTuples(tuple->mutable_operand(instruction->tuple_index())); + if (tmp_tuples.empty()) { + return {}; + } + result_tuples.insert(result_tuples.end(), tmp_tuples.begin(), + tmp_tuples.end()); + } + return result_tuples; + } + return {}; +} + +bool ArCrsCombiner::TupleElementsComputeSameValue( + HloInstruction* tuple_shaped_instruction, int64 i1, int64 i2, + absl::flat_hash_map* visited_pairs) { + auto tuples = GetAllTuples(tuple_shaped_instruction); + if (tuples.empty()) { + return false; + } + for (auto tuple : tuples) { + CHECK(tuple->opcode() == HloOpcode::kTuple); + if (!InstructionsComputeSameValue(tuple->mutable_operand(i1), + tuple->mutable_operand(i2), + visited_pairs)) { + return false; + } + } + return true; +} + +/* static */ +bool ArCrsCombiner::TestInstructionsComputeSameValue(HloInstruction* i1, + HloInstruction* i2) { + ArCrsCombiner combiner(/*num_spatial_partitions=*/2); + auto module = i1->parent()->parent(); + CHECK_EQ(module, i2->parent()->parent()); + combiner.call_graph_ = CallGraph::Build(module); + absl::flat_hash_map visited_pairs; + return combiner.InstructionsComputeSameValue(i1, i2, &visited_pairs); +} + +bool ArCrsCombiner::InstructionsComputeSameValue( + HloInstruction* i1, HloInstruction* i2, + absl::flat_hash_map* visited_pairs) { + if (i1 == i2) { + return true; + } + auto uid1 = i1->unique_id(); + auto uid2 = i2->unique_id(); + auto min_uid = std::min(uid1, uid2); + auto max_uid = std::max(uid1, uid2); + auto it = visited_pairs->find(min_uid); + if (it != visited_pairs->end() && max_uid == it->second) { + return true; + } + auto opcode1 = i1->opcode(); + auto operands1 = 
i1->operands(); + if (opcode1 != i2->opcode() || operands1.size() != i2->operands().size()) { + return false; + } + if (opcode1 == HloOpcode::kConstant || i1->IsCrossModuleAllReduce()) { + return i1->Identical( + *i2, + /*eq_operands=*/std::equal_to(), + /*eq_computations=*/std::equal_to(), + /*layout_sensitive=*/false); + } + visited_pairs->emplace(min_uid, max_uid); + for (int i = 0; i < operands1.size(); ++i) { + auto operand1 = operands1[i]; + auto operand2 = i2->operands()[i]; + if (!InstructionsComputeSameValue(operand1, operand2, visited_pairs)) { + return false; + } + } + if (opcode1 == HloOpcode::kGetTupleElement) { + if (i1->tuple_index() == i2->tuple_index()) { + return true; + } + return TupleElementsComputeSameValue(operands1[0], i1->tuple_index(), + i2->tuple_index(), visited_pairs); + } + return true; +} + +void ArCrsCombiner::GroupAllReducesById(HloModule* module) { + for (HloComputation* computation : module->MakeNonfusionComputations()) { + for (HloInstruction* instruction : computation->instructions()) { + auto ar = MatchesArCrsPattern(instruction); + if (ar) { + all_reduce_map_[*((*ar)->all_reduce_id())].push_back(*ar); + } + } + } +} + +void ArCrsCombiner::KeepProvablyEqualInstructionGroups() { + for (auto it : all_reduce_map_) { + auto instruction_vec = it.second; + CHECK_EQ(instruction_vec.size(), num_spatial_partitions_); + + auto instr_0 = instruction_vec[0]; + auto add_0 = instr_0->users()[0]->users()[0]; + CHECK(HloOpcode::kAdd == add_0->opcode()); + + for (int i = 1; i < instruction_vec.size(); ++i) { + auto instr_i = instruction_vec[i]; + auto add_i = instr_i->users()[0]->users()[0]; + CHECK(HloOpcode::kAdd == add_i->opcode()); + absl::flat_hash_map visited_pairs; + if (!InstructionsComputeSameValue(add_0, add_i, &visited_pairs)) { + all_reduce_map_.erase(it.first); + } + } + } +} + +StatusOr ArCrsCombiner::RewriteGraph() { + if (all_reduce_map_.empty()) { + return false; + } + + auto computation_is_addition = [](HloComputation* c) { + return c->instruction_count() == 3 && + Match(c->root_instruction(), m::Add(m::Parameter(), m::Parameter())); + }; + + for (auto it : all_reduce_map_) { + auto instruction_vec = it.second; + for (auto all_reduce : instruction_vec) { + auto parent_computation = all_reduce->parent(); + auto convert = all_reduce->users()[0]; + auto add = convert->users()[0]; + auto crs = add->users()[0]; + + if (!computation_is_addition(all_reduce->called_computations()[0]) || + !computation_is_addition(crs->called_computations()[0])) { + continue; + } + HloInstruction* other_summand = (add->operands()[0] == convert) + ? 
add->operands()[1] + : add->operands()[0]; + // Remove the AllReduce and replace the CRS with: + // AllReduce - (other_summand * (num_spatial_partitions_ - 1)) + TF_CHECK_OK( + all_reduce->ReplaceAllUsesWith(all_reduce->mutable_operand(0))); + crs->set_all_reduce_id(all_reduce->all_reduce_id()); + auto new_shape = crs->shape(); + HloInstruction* to_subtract; + if (num_spatial_partitions_ == 2) { + to_subtract = other_summand; + } else { + Literal partitions_minus_1_lit = Literal(new_shape); + partitions_minus_1_lit.PopulateWithValue( + num_spatial_partitions_ - 1); + auto partitions_minus_1_const = parent_computation->AddInstruction( + HloInstruction::CreateConstant(partitions_minus_1_lit.Clone())); + to_subtract = + parent_computation->AddInstruction(HloInstruction::CreateBinary( + new_shape, HloOpcode::kMultiply, other_summand, + partitions_minus_1_const)); + } + auto sub = + parent_computation->AddInstruction(HloInstruction::CreateBinary( + new_shape, HloOpcode::kSubtract, crs, to_subtract)); + TF_CHECK_OK(crs->ReplaceAllUsesWith(sub)); + TF_CHECK_OK(parent_computation->RemoveInstruction(all_reduce)); + } + } + + return true; +} + +StatusOr ArCrsCombiner::Run(HloModule* module) { + call_graph_ = CallGraph::Build(module); + + GroupAllReducesById(module); + + KeepProvablyEqualInstructionGroups(); + + return RewriteGraph(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h new file mode 100644 index 00000000000..f6a7ef76ec3 --- /dev/null +++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h @@ -0,0 +1,88 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_ + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + +// Combine an AllReduce and a CrossReplicaSum when they are close to each other +// in the graph, to use an efficient CrossReplicaSum implementation that +// fully utilizes the interconnect bandwidth. +class ArCrsCombiner : public HloModulePass { + public: + ArCrsCombiner(int num_spatial_partitions) + : num_spatial_partitions_(num_spatial_partitions) {} + absl::string_view name() const override { return "ar-crs-combiner"; } + StatusOr Run(HloModule* module) override; + + // Helper method to allow testing of InstructionsComputeSameValue. 
+ static bool TestInstructionsComputeSameValue(HloInstruction* i1, + HloInstruction* i2); + + private: + // If the passed instruction is a while parameter, and the while body is only + // called by a single while instruction, return the while instruction. + absl::optional WhileFromBodyParameter( + HloInstruction* instruction); + + // Returns a vector of tuple instructions. + // If all instructions that flow to "instruction" are tuples, return them. + // Otherwise, return an empty vector. + std::vector GetAllTuples(HloInstruction* instruction); + + // Checks whether two different elements in the same tuple compute the same + // value. + bool TupleElementsComputeSameValue( + HloInstruction* tuple_shaped_instruction, int64 i1, int64 i2, + absl::flat_hash_map* visited_pairs); + + // Returns whether the instructions i1 and i2 can be shown to evaluate to the + // same value. Handling WHILE requires recursion, which may cause us to visit + // the same instruction again. To avoid infinite loops, we pass a cache of + // visited instruction pairs. + bool InstructionsComputeSameValue( + HloInstruction* i1, HloInstruction* i2, + absl::flat_hash_map* visited_pairs); + + // Populates all_reduce_map_. + void GroupAllReducesById(HloModule* module); + + // Looks at each AllReduce group in all_reduce_map_, and keeps only the + // groups for which it's safe to move the AllReduce later in the HLO graph. + void KeepProvablyEqualInstructionGroups(); + + // Performs the graph rewrite that eliminates the early AllReduce and turns + // the later CRS into an AllReduce. + StatusOr RewriteGraph(); + + int num_spatial_partitions_; + + // Map from all-reduce ids to the all reduce instructions. + absl::flat_hash_map> all_reduce_map_; + + std::unique_ptr call_graph_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_ diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc new file mode 100644 index 00000000000..9d5eaf63ccf --- /dev/null +++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc @@ -0,0 +1,415 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/ar_crs_combiner.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { + +namespace op = xla::testing::opcode_matchers; + +class ArCrsCombinerTest : public HloTestBase {}; + +TEST_F(ArCrsCombinerTest, SameValueTestBasecase) { + const char* module_str = R"( +HloModule foobar + +ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) { + %p = f32[2,2] parameter(0) + %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + %constant.f32.2 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + auto root_tuple = module->entry_computation()->root_instruction(); + auto i1 = root_tuple->operands()[0]; + auto i2 = root_tuple->operands()[1]; + EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue( + i1, module->entry_computation()->parameter_instruction(0))); + EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2)); +} + +TEST_F(ArCrsCombinerTest, SameValueTestNumOperands) { + const char* module_str = R"( +HloModule foobar + +ENTRY %entrycomp (p: f32[2,2]) -> ((f32[2,2]), (f32[2,2], f32[2,2])) { + %p = f32[2,2] parameter(0) + %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + %tuple1 = (f32[2,2]) tuple(%constant.f32) + %tuple2 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32) + ROOT %tuple = ((f32[2,2]), (f32[2,2], f32[2,2])) tuple(%tuple1, %tuple2) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + auto root_tuple = module->entry_computation()->root_instruction(); + auto i1 = root_tuple->operands()[0]; + auto i2 = root_tuple->operands()[1]; + EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2)); +} + +TEST_F(ArCrsCombinerTest, SameValueTestTupleElementSameIndex) { + const char* module_str = R"( +HloModule foobar + +ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) { + %p = f32[2,2] parameter(0) + %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32) + %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0 + %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=0 + ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + auto root_tuple = module->entry_computation()->root_instruction(); + auto i1 = root_tuple->operands()[0]; + auto i2 = root_tuple->operands()[1]; + EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2)); +} + +TEST_F(ArCrsCombinerTest, SameValueTestTupleElementDifferentIndex1) { + const char* module_str = R"( +HloModule foobar + +ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) { + %p = f32[2,2] parameter(0) + %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32) + %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0 + %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1 + ROOT %tuple = (f32[2,2], 
f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + auto root_tuple = module->entry_computation()->root_instruction(); + auto i1 = root_tuple->operands()[0]; + auto i2 = root_tuple->operands()[1]; + EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2)); +} + +TEST_F(ArCrsCombinerTest, SameValueTestTupleElementDifferentIndex2) { + const char* module_str = R"( +HloModule foobar + +ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) { + %p = f32[2,2] parameter(0) + %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + %constant.f32.2 = f32[2,2] constant(f32[2,2] {{2, 3}, {4, 5}}) + %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2) + %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0 + %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1 + ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + auto root_tuple = module->entry_computation()->root_instruction(); + auto i1 = root_tuple->operands()[0]; + auto i2 = root_tuple->operands()[1]; + EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2)); +} + +TEST_F(ArCrsCombinerTest, SameValueTestWhile1) { + const char* module_str = R"( +HloModule foobar + +%condition (x: (f32[2,2], f32[2,2])) -> pred[] { + %x = (f32[2,2], f32[2,2]) parameter(0) + %constant.0 = s32[] constant(0) + %constant.1 = s32[] constant(1) + ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0) +} + +%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) { + %x = (f32[2,2], f32[2,2]) parameter(0) + %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0 + %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1 + %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32) + %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32) + ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2) +} + +ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) { + %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}}) + %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32) + ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + auto root_while = module->entry_computation()->root_instruction(); + auto body_tuple = root_while->while_body()->root_instruction(); + auto i1 = body_tuple->operands()[0]; + auto i2 = body_tuple->operands()[1]; + EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2)); +} + +TEST_F(ArCrsCombinerTest, SameValueTestWhile2) { + const char* module_str = R"( +HloModule foobar + +%condition (x: (f32[2,2], f32[2,2])) -> pred[] { + %x = (f32[2,2], f32[2,2]) parameter(0) + %constant.0 = s32[] constant(0) + %constant.1 = s32[] constant(1) + ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0) +} + +%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) { + %x = (f32[2,2], f32[2,2]) parameter(0) + %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0 + %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1 + %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32) 
+ %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32) + ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2) +} + +ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) { + %constant.f32.1 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}}) + %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {7, 8}}) + %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2) + ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + auto root_while = module->entry_computation()->root_instruction(); + auto body_tuple = root_while->while_body()->root_instruction(); + auto i1 = body_tuple->operands()[0]; + auto i2 = body_tuple->operands()[1]; + EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2)); +} + +TEST_F(ArCrsCombinerTest, SameValueTestWhile3) { + const char* module_str = R"( +HloModule foobar + +%condition (x: (f32[2,2], f32[2,2])) -> pred[] { + %x = (f32[2,2], f32[2,2]) parameter(0) + %constant.0 = s32[] constant(0) + %constant.1 = s32[] constant(1) + ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0) +} + +%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) { + %x = (f32[2,2], f32[2,2]) parameter(0) + %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {1, 2}}) + %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0 + %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1 + %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32.1) + %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32.2) + ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2) +} + +ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) { + %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}}) + %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32) + ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + auto root_while = module->entry_computation()->root_instruction(); + auto body_tuple = root_while->while_body()->root_instruction(); + auto i1 = body_tuple->operands()[0]->operands()[0]; // %get-tuple-element.1 + auto i2 = body_tuple->operands()[1]->operands()[0]; // %get-tuple-element.2 + EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2)); +} + +TEST_F(ArCrsCombinerTest, RewritePatternArConvertAddCrs) { + const char* module_str = R"( +HloModule foobar + +%binary_add (a: bf16[], b: bf16[]) -> bf16[] { + %a = bf16[] parameter(0) + %b = bf16[] parameter(1) + ROOT %add = bf16[] add(%a, %b) +} + +%sum.f32 (x: f32[], y: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) { + %p = f32[2,2] parameter(0) + %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}}) + %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + + %cross-replica-sum.ar.1 = bf16[2,2] + cross-replica-sum(%constant.bf16), + replica_groups={{0},{1}}, + all_reduce_id=1, + to_apply=%binary_add, + sharding={maximal device=0} + %convert.1 = f32[2,2] + convert(%cross-replica-sum.ar.1), + sharding={maximal device=0} + %add.1 = f32[2,2] + add(%constant.f32, %convert.1), + sharding={maximal device=0} + %cross-replica-sum.1 = f32[2,2] + cross-replica-sum(%add.1), + replica_groups={{0,1}}, + 
to_apply=%sum.f32, + sharding={maximal device=0} + + %cross-replica-sum.ar.2 = bf16[2,2] + cross-replica-sum(%constant.bf16), + replica_groups={{0},{1}}, + all_reduce_id=1, + to_apply=%binary_add, + sharding={maximal device=1} + %convert.2 = f32[2,2] + convert(%cross-replica-sum.ar.2), + sharding={maximal device=1} + %add.2 = f32[2,2] + add(%constant.f32, %convert.2), + sharding={maximal device=1} + %cross-replica-sum.2 = f32[2,2] + cross-replica-sum(%add.2), + replica_groups={{0,1}}, + to_apply=%sum.f32, + sharding={maximal device=1} + + ROOT %tuple = (f32[2,2], f32[2,2]) + tuple(%cross-replica-sum.1, %cross-replica-sum.2), + sharding={{maximal device=0}, {maximal device=1}} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + auto crs_before = + module->entry_computation()->root_instruction()->operands()[0]; + auto replica_groups_before = crs_before->replica_groups(); + ArCrsCombiner combiner(2); + auto changed = combiner.Run(module.get()).ValueOrDie(); + EXPECT_TRUE(changed); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Tuple(op::Subtract(op::CrossReplicaSum(), op::Constant()), + op::Subtract(op::CrossReplicaSum(), op::Constant()))); + auto sub = module->entry_computation()->root_instruction()->operands()[0]; + auto crs_after = sub->operands()[0]; + auto replica_groups_after = crs_after->replica_groups(); + ASSERT_EQ(replica_groups_before.size(), replica_groups_after.size()); + for (int i = 0; i < replica_groups_before.size(); ++i) { + // Somewhat verbose way to compare the replica_ids, because EqualsProto + // is not available in the open-source build. + auto group_before = replica_groups_before[i]; + std::vector ids_before(group_before.replica_ids().begin(), + group_before.replica_ids().end()); + auto group_after = replica_groups_after[i]; + std::vector ids_after(group_after.replica_ids().begin(), + group_after.replica_ids().end()); + EXPECT_EQ(ids_before, ids_after); + } +} + +TEST_F(ArCrsCombinerTest, OtherSummandNotTheSameDontRewrite) { + const char* module_str = R"( +HloModule foobar + +%binary_add (a: bf16[], b: bf16[]) -> bf16[] { + %a = bf16[] parameter(0) + %b = bf16[] parameter(1) + ROOT %add = bf16[] add(%a, %b) +} + +%sum.f32 (x: f32[], y: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) { + %p = f32[2,2] parameter(0) + %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}}) + %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}}) + %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}}) + + %cross-replica-sum.ar.1 = bf16[2,2] + cross-replica-sum(%constant.bf16), + replica_groups={{0},{1}}, + all_reduce_id=1, + to_apply=%binary_add, + sharding={maximal device=0} + %convert.1 = f32[2,2] + convert(%cross-replica-sum.ar.1), + sharding={maximal device=0} + %add.1 = f32[2,2] + add(%constant.f32.1, %convert.1), + sharding={maximal device=0} + %cross-replica-sum.1 = f32[2,2] + cross-replica-sum(%add.1), + replica_groups={{0,1}}, + to_apply=%sum.f32, + sharding={maximal device=0} + + %cross-replica-sum.ar.2 = bf16[2,2] + cross-replica-sum(%constant.bf16), + replica_groups={{0},{1}}, + all_reduce_id=1, + to_apply=%binary_add, + sharding={maximal device=1} + %convert.2 = f32[2,2] + convert(%cross-replica-sum.ar.2), + sharding={maximal device=1} + %add.2 = f32[2,2] + add(%constant.f32.2, %convert.2), + sharding={maximal device=1} + %cross-replica-sum.2 = f32[2,2] + 
cross-replica-sum(%add.2), + replica_groups={{0,1}}, + to_apply=%sum.f32, + sharding={maximal device=1} + + ROOT %tuple = (f32[2,2], f32[2,2]) + tuple(%cross-replica-sum.1, %cross-replica-sum.2), + sharding={{maximal device=0}, {maximal device=1}} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + ArCrsCombiner combiner(2); + auto changed = combiner.Run(module.get()).ValueOrDie(); + EXPECT_FALSE(changed); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc index f70f6ddfec6..0e6ca1871b3 100644 --- a/tensorflow/compiler/xla/service/batchnorm_expander.cc +++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc @@ -107,19 +107,37 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault { } std::unique_ptr Mean( - int64 element_count, HloInstruction* operand, + HloInstruction* element_count, HloInstruction* operand, const std::function)>& add_instruction) { - HloInstruction* elem_count_recip = - add_instruction(HloInstruction::CreateBroadcast( - operand->shape(), - add_instruction(HloInstruction::CreateConvert( - ShapeUtil::MakeShape(operand->shape().element_type(), {}), - add_instruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR0(1.0 / element_count))))), - {})); - return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kMultiply, - operand, elem_count_recip); + auto broadcast = add_instruction( + HloInstruction::CreateBroadcast(operand->shape(), element_count, {})); + return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kDivide, + operand, broadcast); + } + + std::unique_ptr DynamicElementCountPerFeature( + HloInstruction* operand, int64 feature_index, + const std::function)>& + add_instruction) { + auto elements_per_feature_u32 = add_instruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); + + for (int64 i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) { + if (i == feature_index) { + continue; + } + auto dynamic_dimension_size = + add_instruction(HloInstruction::CreateGetDimensionSize( + ShapeUtil::MakeShape(U32, {}), operand, i)); + elements_per_feature_u32 = add_instruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(U32, {}), HloOpcode::kMultiply, + dynamic_dimension_size, elements_per_feature_u32)); + } + + return HloInstruction::CreateConvert( + ShapeUtil::MakeShape(operand->shape().element_type(), {}), + elements_per_feature_u32); } // Replaces the existing HLO instruction old_instruction, with @@ -195,9 +213,6 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( const Shape operand_shape = operand->shape(); PrimitiveType ptype = operand_shape.element_type(); int64 feature_index = batch_norm->feature_index(); - const int64 feature_count = operand_shape.dimensions(feature_index); - const int64 size_in_elements = ShapeUtil::ElementsIn(operand_shape); - int64 elements_per_feature_int64 = size_in_elements / feature_count; HloInstruction* scale = batch_norm->mutable_operand(1); HloInstruction* offset = batch_norm->mutable_operand(2); @@ -220,6 +235,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( } } + auto elements_per_feature = + add(DynamicElementCountPerFeature(operand, feature_index, add)); + auto scale_broadcasted = add( HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index})); @@ -243,13 +261,13 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( add_reduce_computation)); // E[X]. 
- auto mean = add(Mean(elements_per_feature_int64, sum, add)); + auto mean = add(Mean(elements_per_feature, sum, add)); auto mean_broadcasted = add( HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index})); // E[X^2]. - auto square_mean = add(Mean(elements_per_feature_int64, squared_sum, add)); + auto square_mean = add(Mean(elements_per_feature, squared_sum, add)); // E^2[X]. auto mean_square = @@ -458,9 +476,8 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( int64 feature_index = batch_norm->feature_index(); - const int64 size_in_elements = ShapeUtil::ElementsIn(activation_shape); - const int64 feature_count = activation_shape.dimensions(feature_index); - const int64 elements_per_feature_int64 = size_in_elements / feature_count; + auto elements_per_feature = + add(DynamicElementCountPerFeature(activation, feature_index, add)); auto zero_literal = LiteralUtil::CreateR0(0.0f); TF_ASSIGN_OR_RETURN(zero_literal, zero_literal.Convert(ptype)); @@ -553,15 +570,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( add_binary(activation_shape, HloOpcode::kMultiply, scale_broadcasted, rsqrt_var_add_epsilon_broadcasted); - scale_times_rsqrt_var_add_epsilon = add( - Mean(elements_per_feature_int64, scale_times_rsqrt_var_add_epsilon, add)); + scale_times_rsqrt_var_add_epsilon = + add(Mean(elements_per_feature, scale_times_rsqrt_var_add_epsilon, add)); - auto elements_per_feature_literal = - LiteralUtil::CreateR0(elements_per_feature_int64); - TF_ASSIGN_OR_RETURN(elements_per_feature_literal, - elements_per_feature_literal.Convert(ptype)); - auto elements_per_feature = add( - HloInstruction::CreateConstant(std::move(elements_per_feature_literal))); auto i1 = add_binary(activation_shape, HloOpcode::kMultiply, grad_output, add(HloInstruction::CreateBroadcast( activation_shape, elements_per_feature, {}))); diff --git a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc index 08cf8026177..8e8fbbd935b 100644 --- a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc +++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc @@ -36,7 +36,21 @@ limitations under the License. namespace xla { namespace { -using BatchNormExpanderTest = HloTestBase; +class BatchNormExpanderTest : public HloTestBase { + protected: + // BatchNorm should have a dynamic sized dividor for mean operations. + int64 CountGetDimensionSize(const HloModule& module) { + int64 count = 0; + for (HloComputation* comp : module.computations()) { + for (HloInstruction* inst : comp->instructions()) { + if (inst->opcode() == HloOpcode::kGetDimensionSize) { + count++; + } + } + } + return count; + } +}; // Test that we expand BatchNormTraining. TEST_F(BatchNormExpanderTest, BatchNormTraining) { @@ -68,6 +82,7 @@ TEST_F(BatchNormExpanderTest, BatchNormTraining) { /*rewrite_grad_op=*/true); ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); + EXPECT_EQ(CountGetDimensionSize(*module), 3); // Make sure this operation is expanded. EXPECT_EQ(root->opcode(), HloOpcode::kTuple); } @@ -110,6 +125,7 @@ TEST_F(BatchNormExpanderTest, BatchNormGrad) { /*rewrite_grad_op=*/true); ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); + EXPECT_EQ(CountGetDimensionSize(*module), 3); // Make sure this operation is expanded. 
EXPECT_EQ(root->opcode(), HloOpcode::kTuple); } diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index 40c012a5e42..8d7c6244785 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -746,8 +746,7 @@ StatusOr> BufferAssigner::Run( LogicalBuffer::AlignmentFunction color_alignment, bool allow_input_output_aliasing, bool allocate_buffers_for_constants, BufferLiveness::Colorer colorer, ReuseAllocationFunction reuse_checker) { - BufferAssigner assigner(allow_input_output_aliasing, - allocate_buffers_for_constants, std::move(colorer), + BufferAssigner assigner(allocate_buffers_for_constants, std::move(colorer), std::move(reuse_checker)); return assigner.CreateAssignment(module, std::move(hlo_ordering), std::move(buffer_size), @@ -1434,33 +1433,40 @@ BufferAssigner::MergeColocatedBufferSets( computation == module->entry_computation(); }; + std::vector set_can_be_merged(colocated_buffer_sets.size(), true); + + // Do not merge if one of the sets includes live outs, entry parameters or + // constants. + // + // Buffer liveness does not report the correct live range for entry + // parameter and live out buffers so we have to special case them here. On + // backends that support constant buffer allocations, constant buffers are + // assigned globals in readonly storage so we can't merge colocated buffer + // sets containing constants with colocated buffer sets containing writing + // instructions or other constants. + // + // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to + // the caller of the executable so we can't write to entry parameters + // either, and the argument for not merging constants also applies to entry + // parameters. + for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) { + for (auto& buffer : colocated_buffer_sets[i]) { + if (buffer_liveness.MaybeLiveOut(*buffer) || + is_entry_parameter(*buffer) || + buffer->instruction()->opcode() == HloOpcode::kConstant) { + set_can_be_merged[i] = false; + break; + } + } + } + // Returns true if the two colocated buffer sets (specified by their indices // into the colocated_buffer_sets) can be merged into a single set. auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness, &buffer_size, - &is_entry_parameter](int64 i, int64 j) { - // Do not merge if one of the sets includes live outs, entry parameters or - // constants. - // - // Buffer liveness does not report the correct live range for entry - // parameter and live out buffers so we have to special case them here. On - // backends that support constant buffer allocations, constant buffers are - // assigned globals in readonly storage so we can't merge colocated buffer - // sets containing constants with colocated buffer sets containing writing - // instructions or other constants. - // - // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to - // the caller of the executable so we can't write to entry parameters - // either, and the argument for not merging constants also applies to entry - // parameters. 
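The buffer-assignment hunk above hoists the disqualification check (live-out buffers, entry parameters, constants) out of the pairwise `cannot_merge_buffer_sets` lambda into a precomputed `set_can_be_merged` table, so each pair query becomes two lookups instead of a rescan of both sets. A minimal standalone sketch of that precompute-then-lookup pattern, using a hypothetical `Buffer` struct in place of the real logical-buffer and liveness queries:

```
#include <cstddef>
#include <iostream>
#include <vector>

// Simplified stand-ins: the real pass inspects LogicalBuffers via liveness
// queries; here a Buffer just records the three disqualifying properties.
struct Buffer {
  bool maybe_live_out;
  bool is_entry_parameter;
  bool is_constant;
};
using BufferSet = std::vector<Buffer>;

int main() {
  std::vector<BufferSet> colocated_buffer_sets = {
      {{false, false, false}},  // ordinary temporaries: mergeable
      {{true, false, false}},   // contains a live-out buffer: not mergeable
      {{false, false, true}},   // contains a constant: not mergeable
  };

  // Precompute per-set mergeability once, instead of rescanning both sets
  // inside every pairwise query.
  std::vector<bool> set_can_be_merged(colocated_buffer_sets.size(), true);
  for (std::size_t i = 0; i < colocated_buffer_sets.size(); ++i) {
    for (const Buffer& buffer : colocated_buffer_sets[i]) {
      if (buffer.maybe_live_out || buffer.is_entry_parameter ||
          buffer.is_constant) {
        set_can_be_merged[i] = false;
        break;
      }
    }
  }

  // The pairwise predicate now reduces to two table lookups.
  auto cannot_merge_buffer_sets = [&set_can_be_merged](std::size_t i,
                                                       std::size_t j) {
    return !set_can_be_merged[i] || !set_can_be_merged[j];
  };

  std::cout << std::boolalpha;
  std::cout << cannot_merge_buffer_sets(0, 1) << "\n";  // true
  std::cout << cannot_merge_buffer_sets(0, 0) << "\n";  // false
  return 0;
}
```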
- for (int64 key : {i, j}) { - for (auto& buffer : colocated_buffer_sets[key]) { - if (buffer_liveness.MaybeLiveOut(*buffer) || - is_entry_parameter(*buffer) || - buffer->instruction()->opcode() == HloOpcode::kConstant) { - return true; - } - } + &set_can_be_merged](int64 i, int64 j) { + if (!set_can_be_merged[i] || !set_can_be_merged[j]) { + return true; } // Colocated sets satisfy the invariant that all buffers within a set have diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index d8e1612b899..0a9fdede803 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -545,12 +545,10 @@ class BufferAssigner { ReuseAllocationFunction reuse_checker = nullptr); private: - BufferAssigner(bool allow_input_output_aliasing, - bool allocate_buffers_for_constants, + BufferAssigner(bool allocate_buffers_for_constants, BufferLiveness::Colorer colorer, ReuseAllocationFunction reuse_checker) - : allow_input_output_aliasing_(allow_input_output_aliasing), - allocate_buffers_for_constants_(allocate_buffers_for_constants), + : allocate_buffers_for_constants_(allocate_buffers_for_constants), colorer_(colorer), reuse_checker_(reuse_checker) {} virtual ~BufferAssigner() = default; @@ -640,10 +638,6 @@ class BufferAssigner { LogicalBuffer::Color::Hasher> SplitBuffersByColor(const absl::flat_hash_set& buffers); - // If true, buffer assignments assumes that input parameter buffers and output - // buffers can be shared if their sizes match. - bool allow_input_output_aliasing_; - // If true, allocate buffers for constant instructions. bool allocate_buffers_for_constants_; diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index b1fc50cb188..8f482e6ba8c 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -137,8 +137,7 @@ class BufferAssignmentTest : public HloTestBase { } std::unique_ptr RunBufferAssignmentWithInstructionSequence( - HloModule* module, - absl::Span instruction_sequence, + HloModule* module, absl::Span instruction_sequence, int64 alignment = 1) { HloSchedule schedule(module); schedule.set_sequence(module->entry_computation(), instruction_sequence); @@ -1853,7 +1852,7 @@ class WhileBufferAssignmentTest : public HloTestBase { std::unique_ptr RunBufferAssignment(HloModule* module, int64 alignment = 1) { HloSchedule schedule = - ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie(); + ScheduleModule(module, ByteSizeOf).ConsumeValueOrDie(); return BufferAssigner::Run( module, absl::make_unique(schedule), ByteSizeOf, @@ -2162,7 +2161,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { // nodes are traversed during BufferAssignment. TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/sizeof(void*)); })); @@ -2391,15 +2390,16 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { RunCopyInsertion(module.get()); HloSchedule schedule = - ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie(); + ScheduleModule(module.get(), ByteSizeOf).ConsumeValueOrDie(); // To trigger b/38494731, we want a specific Hlo schedule for the // root computation, so we overwrite that entry with a manually // crafted sequence. 
- schedule.set_sequence(module->entry_computation(), - {input1, weights1, one, output1, while1->operand(0), - while1, input0, weights0, zero, output0, - while0->operand(0), while0, gte0, gte1, root_add}); + schedule.set_sequence( + module->entry_computation(), + {input1, weights1, one, output1, while1->mutable_operand(0), while1, + input0, weights0, zero, output0, while0->mutable_operand(0), while0, + gte0, gte1, root_add}); // If this ASSERT fails, we constructed a bogus sequence above and this test // itself is buggy. diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc index aeee543e843..40825a78716 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc @@ -117,7 +117,7 @@ TEST_F(BufferLivenessTest, ElementwiseChain) { auto log = builder.AddInstruction( HloInstruction::CreateUnary(vec_, HloOpcode::kLog, exp)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); auto liveness = @@ -164,7 +164,7 @@ TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) { auto add = builder.AddInstruction( HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation* entry = module->AddEntryComputation(builder.Build()); HloSchedule schedule(module.get()); @@ -213,7 +213,7 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) { auto reverse = builder.AddInstruction(HloInstruction::CreateReverse(vec_, negate, {0})); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); auto liveness = @@ -247,7 +247,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffers) { auto add = builder.AddInstruction( HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); auto liveness = @@ -289,7 +289,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) { auto add = builder.AddInstruction( HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); HloSchedule schedule(module.get()); @@ -336,7 +336,7 @@ TEST_F(BufferLivenessTest, RootInstructionIsNotLastInSequentialOrder) { HloInstruction::CreateSend(recv_done, token, /*channel_id=*/1)); auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build(add)); HloSchedule schedule(module.get()); @@ -373,7 +373,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) { auto outer_tuple = builder.AddInstruction(HloInstruction::CreateTuple({inner_tuple, exp})); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); auto liveness = @@ -393,7 +393,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) { TEST_F(BufferLivenessTest, EmbeddedComputation) { // Test MaybeLiveOut and MayInterfere for embedded computation. 
- auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto embedded_builder = HloComputation::Builder(TestName() + "_embedded"); auto embedded_param = embedded_builder.AddInstruction( @@ -450,7 +450,7 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) { builder.AddInstruction(HloInstruction::CreateGetTupleElement( inner_tuple0.shape(), tuple_constant, 0)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); auto liveness = @@ -576,7 +576,7 @@ TEST_F(BufferLivenessTest, DependentTupleElements) { auto tuple_root = builder.AddInstruction(HloInstruction::CreateTuple({add0, add1})); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(BuildDummyComputation()); module->AddEmbeddedComputation(builder.Build()); @@ -611,8 +611,8 @@ TEST_F(BufferLivenessTest, DependentTupleElements) { class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest { protected: // Builds and runs a computation (see test case computation graphs below). - std::unique_ptr BuildModule(const bool update_uses_tuple_element1, - const bool fuse_gte0) { + std::unique_ptr BuildModule( + const bool update_uses_tuple_element1, const bool fuse_gte0) { auto builder = HloComputation::Builder(TestName()); // Create param0 Tuple. Shape data_shape = ShapeUtil::MakeShape(F32, {8}); @@ -646,7 +646,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest { builder.AddInstruction( HloInstruction::CreateTuple({gte0, dynamic_update_slice})); // Build module and get reference to entry computation. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); auto* computation = module->entry_computation(); // Create fusion instruction based on number of tuple element 1 users. @@ -802,7 +802,7 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest { auto tuple_root = builder.AddInstruction( HloInstruction::CreateTuple({gte0, dynamic_update_slice})); // Build module and get reference to entry computation. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(BuildDummyComputation()); module->AddEmbeddedComputation(builder.Build()); // Run BufferLiveness on 'module'. diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc index bdd5069632e..7987343bfaf 100644 --- a/tensorflow/compiler/xla/service/call_graph.cc +++ b/tensorflow/compiler/xla/service/call_graph.cc @@ -325,6 +325,15 @@ bool CallGraph::IsFlattened() const { return true; } +std::vector CallGraph::GetComputationCallers( + HloComputation* c) { + std::vector callers; + for (auto callsite : GetNode(c).caller_callsites()) { + callers.push_back(callsite.instruction()); + } + return callers; +} + std::pair CallGraph::NearestAncestorsInSameComputation(HloInstruction* a, HloInstruction* b) const { diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h index cb56f4789d0..05c7c998738 100644 --- a/tensorflow/compiler/xla/service/call_graph.h +++ b/tensorflow/compiler/xla/service/call_graph.h @@ -236,6 +236,10 @@ class CallGraph { // FlattenCallGraph. bool IsFlattened() const; + // Returns a vector of instructions calling the passed computation. + // (Often a vector of size 1.) 
+ std::vector GetComputationCallers(HloComputation* c); + string ToString() const; private: diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index 67132274c0d..1965925fa7f 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -86,15 +86,15 @@ CompileOnlyService::CompileAheadOfTime( Executable::DumpToDirectory(per_host_path, filename, hlo_snapshot)); } - const auto& program_shape = instance.computation.host_program_shape(); ExecutionOptions execution_options; *execution_options.mutable_debug_options() = debug_options; *execution_options.mutable_shape_with_output_layout() = - *instance.result_layout; + instance.result_layout->ToProto(); TF_ASSIGN_OR_RETURN( std::unique_ptr module_config, - CreateModuleConfig(program_shape, instance.argument_layouts, - &execution_options)); + CreateModuleConfig( + ProgramShape(instance.computation.host_program_shape()), + instance.argument_layouts, &execution_options)); TF_ASSIGN_OR_RETURN( std::unique_ptr hlo_module, diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h index c899ffb9dc5..844b42a38d7 100644 --- a/tensorflow/compiler/xla/service/computation_placer.h +++ b/tensorflow/compiler/xla/service/computation_placer.h @@ -105,8 +105,6 @@ class ComputationPlacer { // Map from platform kind to computation placer singleton. static std::map* GetPlatformComputationPlacers(); - se::Platform::Id platform_id_; - TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer); }; diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc index 7f7f1503a09..95c7724c3c9 100644 --- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc +++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc @@ -142,16 +142,16 @@ std::vector GetMaskIds(int64 group_size, int64 group_count) { // Finally we use the Eq op of these two broadcasted constants and get the // desired mask. HloInstruction* GetExpandedFilterMask( - const Shape& filter_shape, int64 input_feature_dim, - int64 output_feature_dim, int64 group_count, + const Shape& filter_shape, int64 kernel_input_feature_dim, + int64 kernel_output_feature_dim, int64 group_count, const std::function)>& add_instruction) { Shape expanded_filter_shape = - ExpandedFilterShape(filter_shape, group_count, input_feature_dim); + ExpandedFilterShape(filter_shape, group_count, kernel_input_feature_dim); Shape mask_shape = ShapeUtil::MakeShape( S32, AsInt64Slice(expanded_filter_shape.dimensions())); - int64 output_feature = filter_shape.dimensions(output_feature_dim); - int64 group_size = filter_shape.dimensions(input_feature_dim); + int64 output_feature = filter_shape.dimensions(kernel_output_feature_dim); + int64 group_size = filter_shape.dimensions(kernel_input_feature_dim); // Create a 'input_feature' sized linspace and 'output_feature' sized linspace // that will be broadcasted into perpendicular dimensions and compared. 
@@ -159,15 +159,14 @@ HloInstruction* GetExpandedFilterMask( GetMaskIds(group_size, group_count); const std::vector output_feature_filter_mask = GetMaskIds(output_feature / group_count, group_count); - auto mask1 = add_instruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1(input_feature_filter_mask))); - auto broadcasted_mask1 = add_instruction( - HloInstruction::CreateBroadcast(mask_shape, mask1, {input_feature_dim})); + auto broadcasted_mask1 = add_instruction(HloInstruction::CreateBroadcast( + mask_shape, mask1, {kernel_input_feature_dim})); auto mask2 = add_instruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1(output_feature_filter_mask))); - auto broadcasted_mask2 = add_instruction( - HloInstruction::CreateBroadcast(mask_shape, mask2, {output_feature_dim})); + auto broadcasted_mask2 = add_instruction(HloInstruction::CreateBroadcast( + mask_shape, mask2, {kernel_output_feature_dim})); // Compare the broadcasted output feature linspace to the input feature // linspace to create a diagonal predicate. @@ -189,91 +188,203 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { }; auto dim_numbers = convolution->convolution_dimension_numbers(); - int64 input_feature_dim = dim_numbers.kernel_input_feature_dimension(); - int64 group_size = filter->shape().dimensions(input_feature_dim); - int64 output_feature_dim = dim_numbers.kernel_output_feature_dimension(); - auto expanded_filter_shape = - ExpandedFilterShape(filter->shape(), group_count, input_feature_dim); - HloInstruction* filter_mask = GetExpandedFilterMask( - filter->shape(), input_feature_dim, output_feature_dim, group_count, add); + int64 kernel_input_feature_dim = dim_numbers.kernel_input_feature_dimension(); + int64 group_size = filter->shape().dimensions(kernel_input_feature_dim); + int64 kernel_output_feature_dim = + dim_numbers.kernel_output_feature_dimension(); + auto expanded_filter_shape = ExpandedFilterShape(filter->shape(), group_count, + kernel_input_feature_dim); + HloInstruction* filter_mask = + GetExpandedFilterMask(filter->shape(), kernel_input_feature_dim, + kernel_output_feature_dim, group_count, add); HloInstruction* expanded_filter; if (group_size == 1) { bool depthwise_separable = - (group_count == filter->shape().dimensions(output_feature_dim)); + (group_count == filter->shape().dimensions(kernel_output_feature_dim)); // If the code generator handles depthwise separable convolutions // inherently, then no filter expansion is needed. if (!filter_expansion_ && depthwise_separable) { - const int64 old_kernel_input_feature_dimension = - dim_numbers.kernel_input_feature_dimension(); - const int64 old_kernel_output_feature_dimension = - dim_numbers.kernel_output_feature_dimension(); - - // For depthwise convolutions, we want the kernel input feature dimension - // to be smaller than the output feature dimension. If that's not the - // case, we swap the dimensions. 
- if (old_kernel_input_feature_dimension > - old_kernel_output_feature_dimension) { - Shape reshaped_filter_shape = filter->shape(); - auto& dimensions = *reshaped_filter_shape.mutable_dimensions(); - std::swap(dimensions[old_kernel_input_feature_dimension], - dimensions[old_kernel_output_feature_dimension]); - - auto reshaped_filter = - add(HloInstruction::CreateReshape(reshaped_filter_shape, filter)); - - dim_numbers.set_kernel_input_feature_dimension( - old_kernel_output_feature_dimension); - - dim_numbers.set_kernel_output_feature_dimension( - old_kernel_input_feature_dimension); - - auto new_convolution = HloInstruction::CreateConvolve( - convolution->shape(), convolution->mutable_operand(0), - reshaped_filter, group_count, convolution->window(), dim_numbers, - convolution->precision_config()); - - TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( - convolution, std::move(new_convolution))); - } return Status::OK(); } // We want to repeat 'filter' in the 'input_feature_dim' dimension // 'group_count' times. Shape reshaped_filter_shape = - ShapeUtil::DeleteDimension(input_feature_dim, filter->shape()); + ShapeUtil::DeleteDimension(kernel_input_feature_dim, filter->shape()); auto reshaped_filter = add(HloInstruction::CreateReshape(reshaped_filter_shape, filter)); std::vector broadcast_dims; for (int64 i = 0; i < filter->shape().dimensions_size(); ++i) { - if (i == input_feature_dim) { + if (i == kernel_input_feature_dim) { continue; } broadcast_dims.push_back(i); } expanded_filter = add(HloInstruction::CreateBroadcast( expanded_filter_shape, reshaped_filter, broadcast_dims)); + + auto zero = add(HloInstruction::CreateConstant( + LiteralUtil::Zero(expanded_filter_shape.element_type()))); + auto zero_filter = + add(HloInstruction::CreateBroadcast(expanded_filter_shape, zero, {})); + auto new_filter = add(HloInstruction::CreateTernary( + expanded_filter_shape, HloOpcode::kSelect, filter_mask, expanded_filter, + zero_filter)); + + auto new_convolution = HloInstruction::CreateConvolve( + convolution->shape(), convolution->mutable_operand(0), new_filter, + /*feature_group_count=*/1, convolution->window(), dim_numbers, + convolution->precision_config()); + TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( + convolution, std::move(new_convolution))); } else { - // We could possibly also use reshape, broadcast, reshape instead of concat - // here, but it would require more complex code, and for depthwise - // convolution we would never end up in this branch. - std::vector concat_operands(group_count, filter); - expanded_filter = add(HloInstruction::CreateConcatenate( - expanded_filter_shape, concat_operands, input_feature_dim)); + int64 activation_input_feature_dim = dim_numbers.input_feature_dimension(); + + int64 output_feature = + filter->shape().dimensions(kernel_output_feature_dim); + + // If group_count == output_feature, then we map those grouped convolutions + // onto depthwise convolution. This is done by adding an additional spatial + // dimension to the activations, kernel, and the output. + // E.g., we would turn + // [2, 12]{B, IF} conv [3, 4]{IF, OF} into + // [3, 2, 4]{S, B, IF} depth conv [3, 1, 4]{S, IF, OF}, where S is the + // additional spatial dimension. The generated convolution output will be + // [1, 2, 4]{S, B, OF} and then reshape the output back to [2, 4] {B, OF}. 
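To make the shape bookkeeping in the comment above concrete, the following standalone sketch (plain integer arithmetic, not real HLO construction) derives the reshaped activation, kernel, and output dimensions for the `[2, 12] conv [3, 4]` example, assuming `group_count == output_feature` as in this branch:

```
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Shape bookkeeping only: given a grouped convolution where
// group_count == output_feature, derive the shapes used by the depthwise
// rewrite described above. Dimension labels (S, B, IF, OF) follow the
// comment; no real HLO instructions are built here.
int main() {
  const int64_t batch = 2, input_feature = 12, output_feature = 4;
  const int64_t group_count = output_feature;              // precondition of this branch
  const int64_t group_size = input_feature / group_count;  // kernel IF size = 3

  // Original operands: activation {B, IF} and kernel {IF(=group_size), OF}.
  std::vector<int64_t> activation = {batch, input_feature};    // [2, 12]
  std::vector<int64_t> kernel = {group_size, output_feature};  // [3, 4]

  // Append a major spatial dimension S of size group_size to the activation
  // and shrink its feature dimension to group_count: [3, 2, 4] {S, B, IF}.
  std::vector<int64_t> reshaped_activation = {group_size, batch, group_count};

  // Append a size-1 dimension to the kernel; the old kernel IF dimension
  // becomes spatial and the new size-1 dimension is the kernel IF:
  // [3, 1, 4] {S, IF, OF}.
  std::vector<int64_t> reshaped_kernel = {group_size, 1, output_feature};

  // A depthwise convolution over S with window size group_size and stride 1
  // collapses S to 1: [1, 2, 4] {S, B, OF}; dropping S restores {B, OF}.
  std::vector<int64_t> conv_output = {1, batch, output_feature};
  std::vector<int64_t> final_output = {batch, output_feature};  // [2, 4]

  auto print = [](const char* name, const std::vector<int64_t>& dims) {
    std::cout << name << " = [";
    for (std::size_t i = 0; i < dims.size(); ++i) {
      std::cout << dims[i] << (i + 1 < dims.size() ? ", " : "");
    }
    std::cout << "]\n";
  };
  print("activation", activation);
  print("kernel", kernel);
  print("reshaped_activation", reshaped_activation);
  print("reshaped_kernel", reshaped_kernel);
  print("conv_output", conv_output);
  print("final_output", final_output);
  return 0;
}
```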
+ + if (group_count == output_feature && !filter_expansion_) { + auto filter = convolution->mutable_operand(1); + auto activation = convolution->mutable_operand(0); + + // Add spatial dimension to the activation, and reshape. + Shape reshaped_activation_shape = activation->shape(); + ShapeUtil::AppendMajorDimension(group_size, &reshaped_activation_shape); + + int64 new_spatial_dim = reshaped_activation_shape.dimensions().size() - 1; + + reshaped_activation_shape.set_dimensions(activation_input_feature_dim, + group_count); + activation = add( + HloInstruction::CreateReshape(reshaped_activation_shape, activation)); + + // Add spatial dimension to the filter, and reshape. + Shape reshaped_filter_shape = filter->shape(); + ShapeUtil::AppendMajorDimension(1, &reshaped_filter_shape); + + filter = + add(HloInstruction::CreateReshape(reshaped_filter_shape, filter)); + + Shape new_output_shape = convolution->shape(); + ShapeUtil::AppendMajorDimension(1, &new_output_shape); + + // Edit convolution dimension numbers. Note that kernel_input_feature_dim + // now becomes a spatial dimension, and the newly added dimension of size + // 1 is the new kernel_input_feature_dim. + dim_numbers.add_input_spatial_dimensions(new_spatial_dim); + dim_numbers.add_kernel_spatial_dimensions(kernel_input_feature_dim); + dim_numbers.set_kernel_input_feature_dimension(new_spatial_dim); + dim_numbers.add_output_spatial_dimensions(new_spatial_dim); + + // Add window for the new spatial dimension. + Window new_window = convolution->window(); + auto* dim = new_window.add_dimensions(); + dim->set_window_dilation(1); + dim->set_base_dilation(1); + dim->set_stride(1); + dim->set_size(group_size); + + auto new_convolution = add(HloInstruction::CreateConvolve( + new_output_shape, activation, filter, group_count, new_window, + dim_numbers, convolution->precision_config())); + + // Delete the extra spatial dimension, and reshape. + Shape reshaped_convolution_shape = + ShapeUtil::DeleteDimension(new_spatial_dim, new_convolution->shape()); + auto reshaped_convolution = HloInstruction::CreateReshape( + reshaped_convolution_shape, new_convolution); + + TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( + convolution, std::move(reshaped_convolution))); + + } else { + // The filter expansion mechanism adds zeroes in the kernel. + // For an OF = 12, IF = 6, and kernel IF = 2, the expanded filter mask + // would look like (IF on the Y-axis, OF on the X-axis) + // 1 1 1 1 0 0 0 0 0 0 0 0 + // 1 1 1 1 0 0 0 0 0 0 0 0 + // 0 0 0 0 1 1 1 1 0 0 0 0 + // 0 0 0 0 1 1 1 1 0 0 0 0 + // 0 0 0 0 0 0 0 0 1 1 1 1 + // 0 0 0 0 0 0 0 0 1 1 1 1 + // + // Instead of convolving the above with the input, we instead slice the + // kernel into three kernels, each containing islands of 1s from the + // filter above. We also slice the activations in the IF dimension with + // each slice of size = group_size. For each slice, we perform + // convolutions, and concatenate the generated outputs in the output OF + // dimension. 
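As a quick cross-check of the comment above, the slice bounds for the OF = 12, IF = 6, kernel-IF = 2 example work out as below. This is a standalone sketch of the index arithmetic only; the pass itself materializes these bounds as kSlice, kConvolution, and kConcatenate instructions.

```
#include <cstdint>
#include <iostream>

// Slice bookkeeping for the example above: OF = 12, IF = 6, kernel IF
// (group_size) = 2, hence group_count = 3. Each group takes a kernel slice
// of width OF / group_count along the output-feature dimension and an
// activation slice of width group_size along the input-feature dimension;
// the per-group convolution results are concatenated along OF.
int main() {
  const int64_t output_feature = 12;
  const int64_t group_size = 2;   // kernel input features
  const int64_t group_count = 3;  // = IF / group_size
  const int64_t filter_slice_width = output_feature / group_count;  // 4

  for (int64_t i = 0; i < group_count; ++i) {
    const int64_t filter_start = i * filter_slice_width;  // OF dimension
    const int64_t filter_limit = (i + 1) * filter_slice_width;
    const int64_t activation_start = i * group_size;      // IF dimension
    const int64_t activation_limit = (i + 1) * group_size;
    std::cout << "group " << i << ": kernel OF slice [" << filter_start
              << ", " << filter_limit << "), activation IF slice ["
              << activation_start << ", " << activation_limit << ")\n";
  }
  return 0;
}
```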
+ + std::vector sliced_convolutions; + auto activation = convolution->mutable_operand(0); + std::vector slice_strides(filter->shape().dimensions_size(), 1); + std::vector filter_slice_starts(filter->shape().dimensions_size(), + 0); + std::vector filter_slice_limits( + filter->shape().dimensions().begin(), + filter->shape().dimensions().end()); + std::vector activation_slice_starts( + activation->shape().dimensions_size(), 0); + std::vector activation_slice_limits( + activation->shape().dimensions().begin(), + activation->shape().dimensions().end()); + + int64 output_feature = + filter->shape().dimensions(kernel_output_feature_dim); + auto output_feature_dim = dim_numbers.output_feature_dimension(); + int64 filter_slice_width = output_feature / group_count; + + int64 activation_input_feature_dim = + dim_numbers.input_feature_dimension(); + + for (int64 i = 0; i < group_count; i++) { + filter_slice_starts[kernel_output_feature_dim] = i * filter_slice_width; + filter_slice_limits[kernel_output_feature_dim] = + (i + 1) * filter_slice_width; + auto filter_sliced_shape = filter->shape(); + filter_sliced_shape.set_dimensions(kernel_output_feature_dim, + filter_slice_width); + auto filter_slice = add(HloInstruction::CreateSlice( + filter_sliced_shape, filter, filter_slice_starts, + filter_slice_limits, slice_strides)); + + activation_slice_starts[activation_input_feature_dim] = i * group_size; + activation_slice_limits[activation_input_feature_dim] = + (i + 1) * group_size; + auto activation_sliced_shape = activation->shape(); + activation_sliced_shape.set_dimensions(activation_input_feature_dim, + group_size); + auto activation_slice = add(HloInstruction::CreateSlice( + activation_sliced_shape, activation, activation_slice_starts, + activation_slice_limits, slice_strides)); + + auto conv_slice_shape = convolution->shape(); + conv_slice_shape.set_dimensions(output_feature_dim, filter_slice_width); + + auto new_convolution = add(HloInstruction::CreateConvolve( + conv_slice_shape, activation_slice, filter_slice, + /*feature_group_count=*/1, convolution->window(), dim_numbers, + convolution->precision_config())); + + sliced_convolutions.push_back(new_convolution); + } + + auto new_conv = HloInstruction::CreateConcatenate( + convolution->shape(), sliced_convolutions, output_feature_dim); + TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( + convolution, std::move(new_conv))); + } } - auto zero = add(HloInstruction::CreateConstant( - LiteralUtil::Zero(expanded_filter_shape.element_type()))); - auto zero_filter = - add(HloInstruction::CreateBroadcast(expanded_filter_shape, zero, {})); - auto new_filter = add( - HloInstruction::CreateTernary(expanded_filter_shape, HloOpcode::kSelect, - filter_mask, expanded_filter, zero_filter)); - auto new_convolution = HloInstruction::CreateConvolve( - convolution->shape(), convolution->mutable_operand(0), new_filter, - /*feature_group_count=*/1, convolution->window(), dim_numbers, - convolution->precision_config()); - TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( - convolution, std::move(new_convolution))); + return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc index 28373ebf636..e6bf2143a21 100644 --- a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc +++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc @@ -82,18 +82,14 @@ ENTRY %Convolve1D1Window_0.v3 
(input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2 ConvolutionFeatureGroupConverter converter; ASSERT_TRUE(converter.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); - // Make sure the convolution is converted to one with feature_group_count = 1. - EXPECT_EQ(root->opcode(), HloOpcode::kConvolution); - EXPECT_EQ(root->feature_group_count(), 1); - // Verify that the filter operand has been replaced. - EXPECT_THAT(root->operand(1), - op::Select(op::Eq(op::Broadcast(op::Constant()), - op::Broadcast(op::Constant())), - // We expect to see Concatenate here instead of - // Broadcast, because feature_group_count < input - // feature dimension. - op::Concatenate(op::Parameter(), op::Parameter()), - op::Broadcast(op::Constant()))); + // Make sure the convolution is replaced with a concatenate. + EXPECT_EQ(root->opcode(), HloOpcode::kConcatenate); + // And the operands of the concatenate are convolutions, each with a feature + // group count = 1. + EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kConvolution); + EXPECT_EQ(root->operand(1)->opcode(), HloOpcode::kConvolution); + EXPECT_EQ(root->operand(0)->feature_group_count(), 1); + EXPECT_EQ(root->operand(1)->feature_group_count(), 1); } } // namespace diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 4e547d925f6..df605966387 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -442,7 +442,6 @@ class CopyRemover { const HloOrdering& ordering, HloModule* module) : module_(module), alias_analysis_(alias_analysis), - ordering_(ordering), buffer_value_tracker_(*module, alias_analysis, ordering) {} // Try to elide the given copy. The copy is elided if the instruction is not @@ -1003,7 +1002,6 @@ class CopyRemover { HloModule* module_; const HloAliasAnalysis& alias_analysis_; - const HloOrdering& ordering_; // Object tracking the HLO values contained in each HLO buffer. BufferValueTracker buffer_value_tracker_; diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc index 7446bc7cc11..e4e9d7ba05c 100644 --- a/tensorflow/compiler/xla/service/copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc @@ -94,7 +94,7 @@ TEST_F(CopyInsertionTest, SingleParameter) { EXPECT_THAT(x->users(), UnorderedElementsAre(tuple)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); InsertCopies(module.get()); @@ -114,7 +114,7 @@ TEST_F(CopyInsertionTest, SingleConstant) { EXPECT_THAT(constant->users(), UnorderedElementsAre(tuple)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); InsertCopies(module.get()); @@ -127,7 +127,7 @@ TEST_F(CopyInsertionTest, SingleConstant) { TEST_F(CopyInsertionTest, ExistingCopiesNotRemoved) { // Verify that kCopy instructions which change layout and exist before // copy-insertion remain in the graph after copy-insertion. 
- auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); HloInstruction* constant = @@ -181,7 +181,7 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) { builder.AddInstruction(HloInstruction::CreateTuple({constant2, x, add})); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); InsertCopies(module.get()); @@ -217,7 +217,7 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) { EXPECT_THAT(constant2->users(), UnorderedElementsAre(tuple1, tuple2)); EXPECT_THAT(constant3->users(), UnorderedElementsAre(tuple2)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); HloInstruction* old_root = module->entry_computation()->root_instruction(); @@ -238,7 +238,7 @@ TEST_F(CopyInsertionTest, BitcastParameter) { HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast)); @@ -261,7 +261,7 @@ TEST_F(CopyInsertionTest, BitcastConstant) { HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, constant)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_THAT(constant->users(), UnorderedElementsAre(bitcast)); @@ -283,7 +283,7 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) { ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x)); builder.AddInstruction(HloInstruction::CreateTuple({bitcast})); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast)); @@ -310,7 +310,7 @@ TEST_F(CopyInsertionTest, NestedTupleParameter) { ShapeUtil::MakeShape(F32, {42})}), "param0")); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(HloOpcode::kParameter, @@ -351,7 +351,7 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) { auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement( ShapeUtil::GetSubshape(param->shape(), {0}), param, 0)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(gte, module->entry_computation()->root_instruction()); @@ -388,7 +388,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) { builder.AddInstruction(HloInstruction::CreateGetTupleElement( ShapeUtil::GetSubshape(select->shape(), {0}), select, 0)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_EQ(gte, module->entry_computation()->root_instruction()); @@ -1295,7 +1295,7 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) { TEST_F(CopyInsertionTest, SwizzlingWhile) { // Test a while instruction with a body which permutes its tuple parameter // elements. 
- auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape loop_state_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1362,7 +1362,7 @@ TEST_F(CopyInsertionTest, CrossingParameters) { // | / \ | // | / \| // (p1 , p0) - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1395,7 +1395,7 @@ TEST_F(CopyInsertionTest, ParametersAliasing) { // | | // | | // (p0 , p1) - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1428,7 +1428,7 @@ TEST_F(CopyInsertionTest, ParameterWithNoAliasing) { // | | // | | // (p0 , p1) - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1461,7 +1461,7 @@ TEST_F(CopyInsertionTest, ParameterWithPartialAliasing) { // | | // | | // (p0 , p1) - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1496,7 +1496,7 @@ TEST_F(CopyInsertionTest, ParameterAndParallelOpsWithPartialAliasing) { // | | | // | | | // +-- (p0 , p1) - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1534,7 +1534,7 @@ TEST_F(CopyInsertionTest, ParameterAndOpsWithPartialAliasing) { // | Add----+ // | | | // +-- (p0 , p1) - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1569,7 +1569,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileWithOneOp) { // the operation (instruction) on the element makes the live range of the // respective input and output elements different than if the instruction were // not there (as in the SwizzlingWhile test above). - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape loop_state_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1632,7 +1632,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileSharedInput) { // the while body is a single constant (both loop state elements are the same // constant). This means no copies are necessary because both loop state // elements are the same so interchanging them is a no-op. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape loop_state_shape = ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); @@ -1693,7 +1693,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) { const Shape loop_state_shape = ShapeUtil::MakeTupleShape( {element_shape, element_shape, element_shape, element_shape}); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param_0 = builder.AddInstruction( HloInstruction::CreateParameter(0, element_shape, "param_0")); @@ -1783,7 +1783,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) { TEST_F(CopyInsertionTest, WhileBodyWithConstantRoot) { // Test a while body and condition which are each simply a constant (root of // computation is a constant). The body constant should be copied. 
- auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto param_0 = builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "param_0")); diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 2763d18121a..ce4c2a9cc69 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -96,6 +96,7 @@ cc_library( "@com_google_absl//absl/types:span", "//tensorflow/compiler/tf2xla:cpu_function_runtime", "//tensorflow/compiler/xla/service:map_inliner", + "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter", "//tensorflow/compiler/xla/service:scatter_expander", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:protobuf_util", diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc index 73b03440cbb..796a7cf94d0 100644 --- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc +++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc @@ -61,19 +61,6 @@ Disabling these as a starting point. // TODO(b/64227304) Creating a custom pass pipeline will replace this. namespace { -class FilteredFunctionPassManager : public llvm::legacy::FunctionPassManager { - public: - FilteredFunctionPassManager(llvm::Module* m, bool disable_expensive_passes) - : llvm::legacy::FunctionPassManager(m), - disable_expensive_passes_(disable_expensive_passes) {} - void add(llvm::Pass* p) override { - llvm::legacy::FunctionPassManager::add(p); - } - - private: - bool disable_expensive_passes_; -}; - class FilteredPassManager : public llvm::legacy::PassManager { public: explicit FilteredPassManager(bool disable_expensive_passes) @@ -96,8 +83,7 @@ class FilteredPassManager : public llvm::legacy::PassManager { std::unique_ptr CompilerFunctor::operator()( llvm::Module& module) const { FilteredPassManager module_passes(disable_expensive_passes_); - FilteredFunctionPassManager function_passes(&module, - disable_expensive_passes_); + llvm::legacy::FunctionPassManager function_passes(&module); VLOG(2) << "IR before optimizations"; XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(module)); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 4ce5a8a2925..6374822c81b 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -76,6 +76,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" +#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -268,10 +269,11 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( /*rewrite_training_op=*/true, /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true); - pass.AddPass( - /*is_layout_sensitive=*/false, - [](const Shape&, const Shape&) { return false; }, - /*enable_dot_strength_reduction=*/false); + pipeline.AddPass(); + AlgebraicSimplifierOptions options( + [](const Shape&, const Shape&) { return false; }); + options.set_enable_dot_strength_reduction(false); + pass.AddPass(options); pass.AddPass(); // BatchNormExpander can create zero-sized ops, so zero-sized HLO @@ -334,10 +336,11 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn( pass.AddInvariantChecker( /*layout_sensitive=*/true, /*allow_mixed_precision=*/false); - pass.AddPass>( - /*is_layout_sensitive=*/true, - [](const Shape&, const Shape&) { return true; }, - /*enable_dot_strength_reduction=*/false); + AlgebraicSimplifierOptions options( + [](const Shape&, const Shape&) { return true; }); + options.set_is_layout_sensitive(true); + options.set_enable_dot_strength_reduction(false); + pass.AddPass>(options); pass.AddPass(); pass.AddPass(/*is_layout_sensitive=*/true); } @@ -587,9 +590,9 @@ StatusOr> CpuCompiler::RunBackend( // Select an order for emitting the HLO instructions for each // computation. Using this sequence enables tighter buffer liveness analysis // and reduced memory usage (as compared to using DependencyHloOrdering). - TF_ASSIGN_OR_RETURN( - HloSchedule schedule, - ScheduleModule(*module, BufferSizeBytesFunction(), DFSMemoryScheduler)); + TF_ASSIGN_OR_RETURN(HloSchedule schedule, + ScheduleModule(module.get(), BufferSizeBytesFunction(), + DFSMemoryScheduler)); // Run buffer allocation on the HLO graph. TF_ASSIGN_OR_RETURN( @@ -779,7 +782,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, XLA_VLOG_LINES(2, module->ToString()); TF_ASSIGN_OR_RETURN(HloSchedule schedule, - ScheduleModule(*module, BufferSizeBytesFunction())); + ScheduleModule(module, BufferSizeBytesFunction())); // Run buffer analysis on the HLO graph. This analysis figures out which // temporary buffers are required to run the computation. 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 29abf38e439..818b2b0d0db 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -51,8 +51,7 @@ namespace cpu { CpuExecutable::CpuExecutable( std::unique_ptr jit, std::unique_ptr assignment, - std::unique_ptr hlo_module, - const string& entry_function_name, + std::unique_ptr hlo_module, const string& entry_function_name, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map) : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 3c3c047bfe8..3b91b15ba9b 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -49,7 +49,7 @@ class CpuExecutable : public Executable { public: CpuExecutable(std::unique_ptr jit, std::unique_ptr assignment, - std::unique_ptr hlo_module, + std::unique_ptr hlo_module, const string& entry_function_name, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index f9cd61bea3d..6f79ad7c146 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -48,10 +48,15 @@ bool IsMatrixVectorDot(const HloInstruction* hlo) { (hlo_shape.dimensions(0) == 1 || hlo_shape.dimensions(1) == 1); } +bool HasExactlyOneUse(const HloInstruction& hlo_instr) { + return hlo_instr.user_count() == 1 && + absl::c_count(hlo_instr.users().front()->operands(), &hlo_instr) == 1; +} + bool CanBeOutputFused(const HloInstruction* producer, const HloInstruction* consumer) { return consumer->opcode() == HloOpcode::kAdd && IsMatrixVectorDot(producer) && - producer->user_count() == 1; + HasExactlyOneUse(*producer) == 1; } bool CanBeOutputFusedIntoSomeOperand(const HloInstruction* consumer) { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc index c95a514ca04..527df0bd1c2 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc @@ -321,7 +321,7 @@ TEST_F(OpcodeFusionTest, Exponential_Reshape_Negate) { builder.AddInstruction( HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape2)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -370,7 +370,7 @@ TEST_F(OpcodeFusionTest, Broadcast_Negate) { builder.AddInstruction(HloInstruction::CreateUnary( result_shape, HloOpcode::kNegate, broadcast1)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -410,7 +410,7 @@ TEST_F(OpcodeFusionTest, Exponential_Negate) { builder.AddInstruction( HloInstruction::CreateUnary(param_shape, HloOpcode::kNegate, exp1)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -429,7 +429,7 
@@ TEST_F(OpcodeFusionTest, Reshape_Negate) { builder.AddInstruction( HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape1)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -447,7 +447,7 @@ TEST_F(OpcodeFusionTest, Reverse_Negate) { builder.AddInstruction( HloInstruction::CreateUnary(param_shape, HloOpcode::kNegate, reverse1)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -489,7 +489,7 @@ TEST_F(OpcodeFusionTest, Exponential_Transpose_Negate) { builder.AddInstruction(HloInstruction::CreateUnary( result_shape, HloOpcode::kNegate, transpose2)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); RunFusionAndCheckOpcodesWereFused( @@ -498,7 +498,7 @@ TEST_F(OpcodeFusionTest, Exponential_Transpose_Negate) { } TEST_F(OpcodeFusionTest, UnaryMapOfExp) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {3, 4}); @@ -517,7 +517,7 @@ TEST_F(OpcodeFusionTest, UnaryMapOfExp) { } TEST_F(OpcodeFusionTest, BinaryMapOfExps) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {3, 4}); @@ -542,7 +542,7 @@ TEST_F(OpcodeFusionTest, BinaryMapOfExps) { } TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000}); @@ -573,7 +573,7 @@ TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) { } TEST_F(OpcodeFusionTest, MessOfFusibleNodes) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); Shape full_shape = ShapeUtil::MakeShape(F32, {4, 100, 10, 100, 50}); @@ -712,7 +712,7 @@ void CreateComputationForDotAddOutputFusionTest(const string& test_name, } TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/1, /*k=*/50, /*n=*/19, /*add_extra_use_for_dot=*/false); @@ -725,7 +725,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) { } TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19, /*k=*/50, /*n=*/1, /*add_extra_use_for_dot=*/false); @@ -738,7 +738,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) { } TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19, /*k=*/50, /*n=*/19, /*add_extra_use_for_dot=*/false); @@ -751,7 +751,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) { } TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); 
CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19, /*k=*/50, /*n=*/1, /*add_extra_use_for_dot=*/true); @@ -763,6 +763,28 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) { Not(op::Fusion())); } +TEST_F(InstructionFusionTest, + DotOperationFusion_DontOutputFuseDuplicateOperands) { + absl::string_view module_string = R"( +HloModule module + +ENTRY main { + a = f32[50,60]{1,0} parameter(0) + b = f32[60,1]{1,0} parameter(1) + c = f32[50,1]{1,0} dot(a, b), lhs_contracting_dims={1}, rhs_contracting_dims={0} + ROOT d = f32[50,1]{1,0} add(c, c) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_string)); + TF_ASSERT_OK_AND_ASSIGN(bool fused_something, + CpuInstructionFusion().Run(module.get())); + EXPECT_FALSE(fused_something); + EXPECT_THAT(module->entry_computation()->root_instruction(), + Not(op::Fusion())); +} + struct GatherLoopFusionTestSpec { string test_name; string hlo_computation_text; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc index 2cd52e4a18a..6c61b64758e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc @@ -73,7 +73,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensor) { auto result = builder.AddInstruction( CreateCanonicalDot(result_shape, dot_lhs, dot_rhs)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout(computation->ComputeProgramShape()); @@ -114,7 +114,7 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor0) { builder.AddInstruction(HloInstruction::CreateBinary( result_shape, HloOpcode::kAdd, dot_a_result, dot_b_result)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout(computation->ComputeProgramShape()); @@ -158,7 +158,7 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor1) { auto tuple_result = builder.AddInstruction( HloInstruction::CreateTuple({dot_a_result, dot_b_result})); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout(computation->ComputeProgramShape()); @@ -192,7 +192,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantLhsTensor) { auto dot_result = builder.AddInstruction( CreateCanonicalDot(result_shape, dot_lhs, dot_rhs)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout(computation->ComputeProgramShape()); @@ -232,7 +232,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensorThroughGTE) { auto dot_result = builder.AddInstruction( CreateCanonicalDot(result_shape, dot_lhs, dot_rhs)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout(computation->ComputeProgramShape()); @@ -353,7 +353,7 @@ static void AssertCorrectLayoutForDotOutputFusion( } TEST_F(CpuLayoutAssignmentTest, 
DotOutputFusion_1x50x19_dot_idx_0) { - std::unique_ptr module = CreateNewUnverifiedModule(); + std::unique_ptr module = CreateNewVerifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19, @@ -365,7 +365,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) { } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) { - std::unique_ptr module = CreateNewUnverifiedModule(); + std::unique_ptr module = CreateNewVerifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19, @@ -377,7 +377,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) { } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) { - std::unique_ptr module = CreateNewUnverifiedModule(); + std::unique_ptr module = CreateNewVerifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1, @@ -389,7 +389,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) { } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) { - std::unique_ptr module = CreateNewUnverifiedModule(); + std::unique_ptr module = CreateNewVerifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1, @@ -401,7 +401,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) { } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) { - std::unique_ptr module = CreateNewUnverifiedModule(); + std::unique_ptr module = CreateNewVerifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19, @@ -413,7 +413,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) { } TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_1) { - std::unique_ptr module = CreateNewUnverifiedModule(); + std::unique_ptr module = CreateNewVerifiedModule(); TF_ASSERT_OK_AND_ASSIGN( DotOutputFusionLayoutAssignmentResult layout_assignment_result, RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19, diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc index b8ace570268..92debb83e33 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc @@ -22,7 +22,6 @@ limitations under the License. 
namespace { const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size"; -const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce"; const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor"; const char* const kXlaEnableExperimentalLlvmIrGemm = "xla_enable_experimental_llvm_ir_gemm"; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 620c45fa391..4032c2da2f3 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -111,7 +111,7 @@ IrEmitter::IrEmitter( StatusOr IrEmitter::EmitComputation( HloComputation* computation, const string& function_name_prefix, bool is_top_level_computation, - const std::vector* instruction_order) { + const std::vector* instruction_order) { string function_name = name_uniquer_.GetUniqueName(function_name_prefix); VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix << "]; ordered? " << (instruction_order != nullptr); @@ -140,7 +140,7 @@ StatusOr IrEmitter::EmitComputation( // readcyclecounter if it is unavailable. bool use_rdtscp = arch_type_ == llvm::Triple::ArchType::x86 || arch_type_ == llvm::Triple::ArchType::x86_64; - profiling_state_ = ProfilingState(use_rdtscp, GetProfileCountersArgument()); + profiling_state_ = ProfilingState(use_rdtscp); if (instruction_order == nullptr) { TF_RETURN_IF_ERROR(computation->Accept(this)); } else { @@ -1379,33 +1379,6 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) { return Status::OK(); } -// Fills up the free variables in 'index_with_free_var' with values from -// 'filler_index'. The size of free variables must be the same as the -// size of 'filler_index'. -// -// This is often used after dimension reduction, where -// 'index_with_free_var' has one or more dimensions reduced, which serves as -// free variables (represented as nullptr). For example, if we have a 4 -// dimensional input and index for the dimension being reduced is -// 2 (third dimension), we will have an index like [i, j, NULL, k] -// after reduced dimension. -// -// Here we fill up that free variable by 'filler_index', which contains -// the value in the reduced dimension. -static llvm_ir::IrArray::Index FillReducedDimensionIndex( - llvm_ir::IrArray::Index index_with_free_var, - llvm_ir::IrArray::Index filler_index) { - llvm_ir::IrArray::Index::const_iterator it = filler_index.begin(); - - for (size_t i = 0; i < index_with_free_var.size(); ++i) { - if (index_with_free_var[i] == nullptr) { - index_with_free_var[i] = *it++; - } - } - CHECK(filler_index.end() == it); - return index_with_free_var; -} - Status IrEmitter::HandleParameter(HloInstruction* parameter) { VLOG(2) << "HandleParameter: " << parameter->ToString(); return EmitTargetAddressForOp(parameter); @@ -2194,14 +2167,6 @@ Status IrEmitter::HandlePad(HloInstruction* pad) { return Status::OK(); } -// If `hlo` is a Transpose, returns its operand; otherwise returns `hlo` itself. 
-static const HloInstruction* StripTranspose(const HloInstruction& hlo) { - if (hlo.IsRank2Transpose()) { - return hlo.operand(0); - } - return &hlo; -} - Status IrEmitter::HandleFusion(HloInstruction* fusion) { auto* root = fusion->fused_expression_root(); if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion, assignment_)) { @@ -2600,10 +2565,17 @@ Status IrEmitter::HandleConditional(HloInstruction* conditional) { return Status::OK(); } -Status IrEmitter::HandleAfterAll(HloInstruction* gen_token) { - TF_RET_CHECK(ByteSizeOf(gen_token->shape()) == 0); +Status IrEmitter::HandleAfterAll(HloInstruction* after_all) { + TF_RET_CHECK(ByteSizeOf(after_all->shape()) == 0); // No code to generate, but we need to emit an address for book-keeping. - TF_RETURN_IF_ERROR(EmitTargetAddressForOp(gen_token)); + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(after_all)); + return Status::OK(); +} + +Status IrEmitter::HandleAddDependency(HloInstruction* add_dependency) { + // AddDedendency just forwards its zero-th operand. + emitted_value_[add_dependency] = + GetEmittedValueFor(add_dependency->operand(0)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 136b88ff75e..559a8162a2d 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -101,7 +101,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, StatusOr EmitComputation( HloComputation* computation, const string& function_name_prefix, bool is_top_level_computation, - const std::vector* instruction_order); + const std::vector* instruction_order); llvm::IRBuilder<>* b() { return &b_; } @@ -159,7 +159,8 @@ class IrEmitter : public DfsHloVisitorWithDefault, Status HandleConcatenate(HloInstruction* concatenate) override; Status HandleConditional(HloInstruction* conditional) override; Status HandleScatter(HloInstruction* scatter) override; - Status HandleAfterAll(HloInstruction* gen_token) override; + Status HandleAfterAll(HloInstruction* after_all) override; + Status HandleAddDependency(HloInstruction* add_dependency) override; Status HandleRng(HloInstruction* rng) override; Status FinishVisit(HloInstruction* root) override; @@ -467,9 +468,8 @@ class IrEmitter : public DfsHloVisitorWithDefault, // profiling a computation. class ProfilingState { public: - ProfilingState() : use_rdtscp_(false), prof_counters_(nullptr) {} - ProfilingState(bool use_rdtscp, llvm::Value* prof_counters) - : use_rdtscp_(use_rdtscp), prof_counters_(prof_counters) {} + ProfilingState() : use_rdtscp_(false) {} + explicit ProfilingState(bool use_rdtscp) : use_rdtscp_(use_rdtscp) {} // Record the cycle counter before an HLO executes. void RecordCycleStart(llvm::IRBuilder<>* b, HloInstruction* hlo); @@ -494,9 +494,6 @@ class IrEmitter : public DfsHloVisitorWithDefault, // intrinsic? bool use_rdtscp_; - // The argument which corresponds to the profile counter buffer. - llvm::Value* prof_counters_; - // The first read cycle counter in the program. llvm::Value* first_read_cycle_start_ = nullptr; diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc index 669eeb95f32..722aa3120ef 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include #include +#include #include #include #include @@ -41,61 +42,60 @@ void KeyValueSort(std::pair* row_to_sort, int64 num_elements) { std::sort(row_to_sort, row_to_sort + num_elements); } -// For floating point numbers, we want a total order comparator. -NaN and NaN -// should appear at the beginning and end of the ordering, and -0.0 should -// appear before 0.0. Also we want to have a stable sort, so if the keys are the -// same, we compare the index values. -template -bool LessThan(KeyType lhs, int64 lhs_index, KeyType rhs, int64 rhs_index) { - bool lhs_is_negative = std::signbit(lhs); - bool rhs_is_negative = std::signbit(rhs); - // If the signs are different, we can just compare the signs. - if (lhs_is_negative != rhs_is_negative) { - return lhs_is_negative && !rhs_is_negative; +// We would like a total order of floating point numbers so that the +// sort has a predictable behavior in the presence of NaNs. Rather +// than using floating point comparison, we use the following trick: +// If f is a float, and +// x = bit_cast(f); +// y = x < 0 ? 0x7FFFFFFF - x : x; +// then y is ordered as an int32 such that finite values have the +// obvious order, -0 is ordered before 0, and -NaN and NaN appear at +// the beginning and end of the ordering. +template +CastType Convert(KeyType value) { + CastType casted_value; + memcpy(&casted_value, &value, sizeof(CastType)); + if (casted_value < 0) { + return static_cast(std::numeric_limits::max()) - + casted_value; } - bool lhs_nan = std::isnan(lhs); - bool rhs_nan = std::isnan(rhs); - // Exactly one number is nan? - if (lhs_nan != rhs_nan) { - if (lhs_nan) { - return lhs_is_negative; - } - return !rhs_is_negative; - } - if (lhs != rhs) { - return lhs < rhs; - } - return lhs_index < rhs_index; + return casted_value; +} + +template +bool LessThan(KeyType lhs, KeyType rhs) { + return Convert(lhs) < + Convert(rhs); } template <> void KeyValueSort(std::pair* row_to_sort, int64 num_elements) { - std::sort(row_to_sort, row_to_sort + num_elements, - [](const std::pair& lhs, - const std::pair& rhs) -> bool { - return LessThan(lhs.first, lhs.second, rhs.first, rhs.second); - }); + std::stable_sort(row_to_sort, row_to_sort + num_elements, + [](const std::pair& lhs, + const std::pair& rhs) -> bool { + return LessThan(lhs.first, rhs.first); + }); } template <> void KeyValueSort(std::pair* row_to_sort, int64 num_elements) { - std::sort(row_to_sort, row_to_sort + num_elements, - [](const std::pair& lhs, - const std::pair& rhs) -> bool { - return LessThan(lhs.first, lhs.second, rhs.first, rhs.second); - }); + std::stable_sort(row_to_sort, row_to_sort + num_elements, + [](const std::pair& lhs, + const std::pair& rhs) -> bool { + return LessThan(lhs.first, rhs.first); + }); } template <> void KeyValueSort(std::pair* row_to_sort, int64 num_elements) { - std::sort(row_to_sort, row_to_sort + num_elements, - [](const std::pair& lhs, - const std::pair& rhs) -> bool { - return LessThan( - Eigen::half_impl::half_to_float(lhs.first), lhs.second, - Eigen::half_impl::half_to_float(rhs.first), rhs.second); - }); + std::stable_sort(row_to_sort, row_to_sort + num_elements, + [](const std::pair& lhs, + const std::pair& rhs) -> bool { + return LessThan( + Eigen::half_impl::half_to_float(lhs.first), + Eigen::half_impl::half_to_float(rhs.first)); + }); } template diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index f77641eb7da..efccadedf27 100644 --- 
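The comment in the hunk above describes mapping float bit patterns to integers so that ordinary integer comparison yields a total order over floats, with -NaN first, -0.0 before +0.0, and NaN last, while std::stable_sort now handles tie-breaking. The sketch below illustrates the same ordering; to stay clear of signed-overflow questions it uses the equivalent unsigned formulation (flip all bits of negative patterns, set the sign bit of non-negative ones) rather than the exact expression quoted in the comment, and the names are illustrative only.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

// Map a float to a uint32_t whose plain '<' ordering matches the total order
// described above: -NaN, -Inf, ..., -0.0, +0.0, ..., +Inf, NaN.
uint32_t TotalOrderKey(float value) {
  uint32_t bits;
  static_assert(sizeof(bits) == sizeof(value), "size mismatch");
  std::memcpy(&bits, &value, sizeof(bits));
  // Negative values (sign bit set): flip all bits so larger magnitudes sort
  // earlier. Non-negative values: set the sign bit so they sort after all
  // negatives.
  return (bits & 0x80000000u) ? ~bits : (bits | 0x80000000u);
}

bool TotalLessThan(float lhs, float rhs) {
  return TotalOrderKey(lhs) < TotalOrderKey(rhs);
}

int main() {
  const float inf = std::numeric_limits<float>::infinity();
  const float nan = std::numeric_limits<float>::quiet_NaN();
  std::printf("-0.0 before +0.0: %d\n", TotalLessThan(-0.0f, 0.0f));   // 1
  std::printf("-NaN before -Inf: %d\n", TotalLessThan(-nan, -inf));    // 1
  std::printf("+Inf before NaN:  %d\n", TotalLessThan(inf, nan));      // 1
  std::printf("-2.0 before -1.0: %d\n", TotalLessThan(-2.0f, -1.0f));  // 1
  return 0;
}
```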
a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -128,8 +128,18 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options, } llvm::JITSymbol SimpleOrcJIT::ResolveRuntimeSymbol(const std::string& name) { - void* func_addr = CustomCallTargetRegistry::Global()->Lookup(name); + void* func_addr = nullptr; + if (name.size() > 1 && name.front() == data_layout_.getGlobalPrefix()) { + // On Mac OS X, 'name' may have a leading underscore prefix, even though the + // registered name may not. + std::string stripped_name(name.begin() + 1, name.end()); + func_addr = CustomCallTargetRegistry::Global()->Lookup(stripped_name); + } else { + func_addr = CustomCallTargetRegistry::Global()->Lookup(name); + } + if (func_addr == nullptr) { + VLOG(2) << "Unable to resolve runtime symbol: " << name; return nullptr; } llvm::JITEvaluatedSymbol symbol_info(reinterpret_cast(func_addr), diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc index 691b3c7bee2..f8f5f392da8 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc @@ -50,7 +50,7 @@ class CpuEigenDotOperationTest /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; - auto hlo_module = CreateNewUnverifiedModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(entry_computation)); CompileAheadOfTimeAndVerifyIr(std::move(hlo_module), options, diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc index d201a151d7a..e30f95311fc 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc @@ -46,7 +46,7 @@ class CpuExternalConstantsTest : public CpuCodegenTest { builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, constant)); - std::unique_ptr module = CreateNewUnverifiedModule(); + std::unique_ptr module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); CompileAndVerifyIr(std::move(module), filecheck_pattern, diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc index 773336c7a92..9b10c49f4f5 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc @@ -91,7 +91,7 @@ TEST_P(CpuUnaryIntrinsicTest, DoIt) { /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; - auto hlo_module = CreateNewUnverifiedModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); string check_lines{spec.check_lines.data(), spec.check_lines.size()}; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc index 3b87683ffff..fa0e09ff6b5 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc @@ -63,7 +63,7 @@ CHECK-NOT: private constant [48 x i8] )"; 
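The ResolveRuntimeSymbol change at the top of this hunk strips the data layout's global prefix (a leading underscore on Mac OS X) before consulting the custom-call registry, and logs when resolution fails. A minimal standalone sketch of that lookup flow, with a std::map standing in for CustomCallTargetRegistry and hypothetical names:

```cpp
#include <cstdio>
#include <map>
#include <string>

using Registry = std::map<std::string, void*>;

void* Lookup(const Registry& registry, const std::string& name) {
  auto it = registry.find(name);
  return it == registry.end() ? nullptr : it->second;
}

// Mirrors the control flow above: if the requested name carries the global
// prefix (e.g. '_' in Mach-O data layouts), look up the stripped name;
// otherwise look up the name as-is.
void* ResolveRuntimeSymbol(const Registry& registry, const std::string& name,
                           char global_prefix) {
  void* func_addr = nullptr;
  if (name.size() > 1 && name.front() == global_prefix) {
    std::string stripped_name(name.begin() + 1, name.end());
    func_addr = Lookup(registry, stripped_name);
  } else {
    func_addr = Lookup(registry, name);
  }
  if (func_addr == nullptr) {
    std::fprintf(stderr, "Unable to resolve runtime symbol: %s\n", name.c_str());
  }
  return func_addr;
}

int main() {
  static int target = 0;
  Registry registry = {{"my_custom_call", &target}};
  // With a '_' global prefix the JIT may ask for "_my_custom_call" even
  // though the registration used the bare name.
  std::printf("%d\n", ResolveRuntimeSymbol(registry, "_my_custom_call", '_') != nullptr);  // 1
  std::printf("%d\n", ResolveRuntimeSymbol(registry, "my_custom_call", '\0') != nullptr);  // 1
  return 0;
}
```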
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseHloString(hlo_text)); + ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", @@ -104,14 +104,14 @@ ENTRY main { )"; string filecheck_pattern = R"( -CHECK: private constant [4 x i8] -CHECK: private constant [8 x i8] +CHECK-DAG: private constant [4 x i8] +CHECK-DAG: private constant [8 x i8] CHECK-NOT: private constant [4 x i8] CHECK-NOT: private constant [8 x i8] )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseHloString(hlo_text)); + ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc index f5419b7063b..a7702c2aeea 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc @@ -56,7 +56,7 @@ TEST_F(CpuNoAliasTest, Concat) { std::unique_ptr computation = builder.Build(); - auto hlo_module = CreateNewUnverifiedModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); // Now that we have an HLO module, build an llvm_ir::AliasAnalysis for it. diff --git a/tensorflow/compiler/xla/service/cpu/xfeed_manager.h b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h index 990ff94ba23..70008947f37 100644 --- a/tensorflow/compiler/xla/service/cpu/xfeed_manager.h +++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h @@ -23,6 +23,7 @@ limitations under the License. #include #include "absl/types/span.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index d6371283221..e84bf00153a 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -251,6 +251,7 @@ class DfsHloVisitorBase { virtual Status HandleBatchNormGrad(HloInstructionPtr hlo) = 0; + virtual Status HandleAddDependency(HloInstructionPtr add_dependency) = 0; virtual Status HandleAfterAll(HloInstructionPtr token) = 0; // Invoked to inform the visitor that the traversal has completed, and that diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index e57184f639f..80ea5be298a 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -206,6 +206,9 @@ class DfsHloVisitorWithDefaultBase Status HandleGetDimensionSize(HloInstructionPtr get_size) override { return DefaultAction(get_size); } + Status HandleAddDependency(HloInstructionPtr add_dependency) override { + return DefaultAction(add_dependency); + } // Invoked to inform the visitor that the traversal has completed, and that // the root was "root". diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc new file mode 100644 index 00000000000..c8bfc890506 --- /dev/null +++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc @@ -0,0 +1,138 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" + +namespace xla { + +Status DynamicParameterBinding::Bind( + const DynamicParameter& dynamic_parameter, + const DynamicDimension& dynamic_dimension) { + auto result = bindings_.emplace(dynamic_dimension, dynamic_parameter); + TF_RET_CHECK(result.second); + return Status::OK(); +} + +absl::optional +DynamicParameterBinding::GetBinding(const DynamicDimension& dynamic_dimension) { + auto param_iter = bindings_.find(dynamic_dimension); + if (param_iter == bindings_.end()) { + return absl::nullopt; + } + return param_iter->second; +} + +DynamicParameterBindingProto DynamicParameterBinding::ToProto() const { + DynamicParameterBindingProto result; + for (const auto& binding : bindings_) { + const DynamicDimension& dynamic_dimension = binding.first; + const DynamicParameter& dynamic_param = binding.second; + DynamicParameterBindingProto::Binding binding_proto; + binding_proto.set_dynamic_param_num(dynamic_param.parameter_num); + for (int64 i : dynamic_param.parameter_index) { + binding_proto.add_dynamic_param_index(i); + } + + binding_proto.set_target_param_num(dynamic_dimension.parameter_num); + + for (int64 i : dynamic_dimension.parameter_index) { + binding_proto.add_target_param_index(i); + } + + binding_proto.set_target_param_dim_num(dynamic_dimension.dimension); + result.add_entries()->Swap(&binding_proto); + } + return result; +} + +StatusOr DynamicParameterBinding::CreateFromProto( + const DynamicParameterBindingProto& proto) { + DynamicParameterBinding result; + for (const DynamicParameterBindingProto::Binding& binding : proto.entries()) { + int64 dynamic_param_num = binding.dynamic_param_num(); + ShapeIndex dynamic_param_index(binding.dynamic_param_index().begin(), + binding.dynamic_param_index().end()); + int64 target_param_num = binding.target_param_num(); + ShapeIndex target_param_index(binding.target_param_index().begin(), + binding.target_param_index().end()); + int64 target_dim_num = binding.target_param_num(); + + TF_RETURN_IF_ERROR( + result.Bind(DynamicParameter{dynamic_param_num, dynamic_param_index}, + DynamicDimension{target_param_num, target_param_index, + target_dim_num})); + } + + return result; +} + +string DynamicParameterBinding::ToString() const { + std::vector pieces; + pieces.push_back("DynamicParameterBinding: "); + for (const auto& binding : bindings_) { + const DynamicDimension& dynamic_dimension = binding.first; + const DynamicParameter& dynamic_param = binding.second; + pieces.push_back(absl::StrFormat( + " -- Input param number %lld at %s has dim %lld as dynamic" + " dimension, which is represented by param number %lld at " + "%s", + dynamic_dimension.parameter_num, + 
dynamic_dimension.parameter_index.ToString(), + dynamic_dimension.dimension, dynamic_param.parameter_num, + dynamic_param.parameter_index.ToString())); + } + return absl::StrJoin(pieces, "\n"); +} + +Status DynamicParameterBinding::ForEachBinding(BindingFn fn) const { + for (const auto& binding : bindings_) { + TF_RETURN_IF_ERROR(fn(binding.second, binding.first)); + } + return Status::OK(); +} + +Status DynamicParameterBinding::Verify(const HloModule& module) const { + const HloComputation* entry = module.entry_computation(); + return ForEachBinding([&](const DynamicParameter& dynamic_parameter, + const DynamicDimension& dynamic_dimension) + -> Status { + TF_RET_CHECK(dynamic_parameter.parameter_num < entry->num_parameters()); + TF_RET_CHECK(dynamic_dimension.parameter_num < entry->num_parameters()); + TF_RET_CHECK(ShapeUtil::IndexIsValid( + entry->parameter_instruction(dynamic_parameter.parameter_num)->shape(), + dynamic_parameter.parameter_index)); + TF_RET_CHECK(ShapeUtil::IndexIsValid( + entry->parameter_instruction(dynamic_dimension.parameter_num)->shape(), + dynamic_dimension.parameter_index)); + TF_RET_CHECK( + dynamic_dimension.dimension < + ShapeUtil::Rank(ShapeUtil::GetSubshape( + entry->parameter_instruction(dynamic_dimension.parameter_num) + ->shape(), + dynamic_dimension.parameter_index))); + return Status::OK(); + }); +} + +std::ostream& operator<<(std::ostream& out, + const DynamicParameterBinding& binding) { + out << binding.ToString(); + return out; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h new file mode 100644 index 00000000000..dd474d8eed1 --- /dev/null +++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h @@ -0,0 +1,125 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/compiler/xla/shape_util.h" + +namespace xla { + +class HloModule; +// We currently use an explicit API that takes an extra parameter to indicate +// the runtime size of a dynamic dimension. DynamicParameterBinding indicates +// the relationship between parameter: We can have a dynamic parameter that +// points to another target parameter to indicate that the target parameter is +// dynamic. +// +// +// TODO(b/119520625): Remove this API once we have more dynamic shape infra +// ready. +class DynamicParameterBinding { + public: + // DynamicParameter represents a special parameter that is used to represent + // the runtime size of a dimension of another parameter. 
A dynamic parameter + // has to be a scalar value. + struct DynamicParameter { + // The parameter number of dynamic parameter. + int64 parameter_num; + // The index of the parameter. + ShapeIndex parameter_index; + }; + + // DynamicDimension represents a dimension whose size is determined at + // runtime. A DynamicDimension's runtime size is determined by the binded + // DynamicParameter using `DynamicParameterBinding::Bind` method. + struct DynamicDimension { + // The parameter number of dynamic dimension. + int64 parameter_num; + // The subshape index of the parameter. + ShapeIndex parameter_index; + // The dimension number in the subshape. + int64 dimension; + + // "friend" keyword are added so these functions can be found by ADL. + template + friend H AbslHashValue(H h, const DynamicDimension& m) { + return H::combine(std::move(h), m.parameter_num, m.parameter_index, + m.dimension); + } + + friend bool operator==(const DynamicDimension& lhs, + const DynamicDimension& rhs) { + return lhs.parameter_num == rhs.parameter_num && + lhs.parameter_index == rhs.parameter_index && + lhs.dimension == rhs.dimension; + } + }; + + DynamicParameterBinding() = default; + + virtual ~DynamicParameterBinding() = default; + + // Adds binding which indicates that the dimension indicated by + // `dynamic_dimension` is dynamic, and its runtime size is represented by + // `dynamic_parameter`. + Status Bind(const DynamicParameter& dynamic_parameter, + const DynamicDimension& dynamic_dimension); + + // Returns the parameter and the index representing the runtime size of + // dimension `dim_num` of parameter `param_num` at `param_index`. + // + // Returns nullopt if the binding is not set. + absl::optional GetBinding( + const DynamicDimension& dynamic_dimension); + + using BindingFn = + std::function; + + // Iterate through each binding. + Status ForEachBinding(BindingFn fn) const; + + DynamicParameterBindingProto ToProto() const; + + static StatusOr CreateFromProto( + const DynamicParameterBindingProto& proto); + + string ToString() const; + + // Verifies that the given binding is valid for the given module. + // Specifically, the binding's parameter and parameter size should be valid. + Status Verify(const HloModule& module) const; + + private: + // Keeps track of mappings from DynamicDimension to DynamicParameter. The + // direction of is chosen so that we can easily query if a dimension is + // dynamic and which dynamic parameter represents the real size of that + // dimension. + absl::flat_hash_map bindings_; +}; + +std::ostream& operator<<(std::ostream& out, + const DynamicParameterBinding& binding); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_ diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc new file mode 100644 index 00000000000..83a6d83dffd --- /dev/null +++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc @@ -0,0 +1,153 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
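DynamicDimension above defines AbslHashValue and operator== as friend functions so they are found by ADL, which is what Abseil's hashing framework needs for the struct to serve as the key of the bindings_ flat_hash_map. Below is a minimal self-contained illustration of that pattern, assuming Abseil is available; the Dimension and Parameter structs are simplified stand-ins, not the real binding types.

```cpp
#include <cstdint>
#include <iostream>
#include <utility>

#include "absl/container/flat_hash_map.h"

struct Dimension {
  int64_t parameter_num;
  int64_t dimension;

  // Friend functions are discovered via ADL, so no std::hash specialization
  // or external hasher is needed.
  template <typename H>
  friend H AbslHashValue(H h, const Dimension& d) {
    return H::combine(std::move(h), d.parameter_num, d.dimension);
  }
  friend bool operator==(const Dimension& lhs, const Dimension& rhs) {
    return lhs.parameter_num == rhs.parameter_num &&
           lhs.dimension == rhs.dimension;
  }
};

struct Parameter {
  int64_t parameter_num;
};

int main() {
  absl::flat_hash_map<Dimension, Parameter> bindings;
  // Dimension 0 of parameter 1 is dynamic; parameter 0 carries its size.
  bindings.emplace(Dimension{1, 0}, Parameter{0});
  auto it = bindings.find(Dimension{1, 0});
  std::cout << (it != bindings.end() ? it->second.parameter_num : -1) << "\n";  // 0
  return 0;
}
```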
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h" + +#include +#include + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_ordering.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { +class DynamicParameterBindingTest : public HloTestBase {}; + +TEST_F(DynamicParameterBindingTest, SimpleBinding) { + // 'b' is a dynamic shape; 'a' represents the real size of b's first + // dimension. + const string module_str = R"( +HloModule TEST + +ENTRY main { + a = f32[] parameter(0) + b = f32[10] parameter(1) + ROOT root = (f32[], f32[10]) tuple(%a, %b) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + + DynamicParameterBinding binding; + + TF_EXPECT_OK( + binding.Bind(DynamicParameterBinding::DynamicParameter{0, {}}, + DynamicParameterBinding::DynamicDimension{1, {}, 0})); + + absl::optional param = + binding.GetBinding( + DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1, + /*parameter_index=*/{}, + /*dimension=*/0}); + EXPECT_TRUE(param); + EXPECT_EQ(param->parameter_num, 0); + EXPECT_EQ(param->parameter_index, ShapeIndex({})); + TF_EXPECT_OK(binding.Verify(*module)); +} + +TEST_F(DynamicParameterBindingTest, TupleBinding) { + // 'gte2' is a dynamic shape; 'gte1' represents the real size of gte2's first + // dimension. + const string module_str = R"( +HloModule TEST + +ENTRY main { + param = (f32[], f32[10]) parameter(0) + gte1 = f32[] get-tuple-element(%param), index=0 + gte2 = f32[10] get-tuple-element(%param), index=1 + ROOT root = (f32[], f32[10]) tuple(%gte1, %gte2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + + DynamicParameterBinding binding; + + TF_EXPECT_OK( + binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}}, + DynamicParameterBinding::DynamicDimension{0, {1}, 0})); + + absl::optional param = + binding.GetBinding( + DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0, + /*parameter_index=*/{1}, + /*dimension=*/0}); + + EXPECT_TRUE(param); + EXPECT_EQ(param->parameter_num, 0); + EXPECT_EQ(param->parameter_index, ShapeIndex({0})); + TF_EXPECT_OK(binding.Verify(*module)); +} + +TEST_F(DynamicParameterBindingTest, TupleBindingWithMultiDimension) { + // 'gte2' is a dynamic shape; 'gte1' represents the real size of gte2's both + // dimensions. 
+ const string module_str = R"( +HloModule TEST + +ENTRY main { + param = (f32[], f32[10, 10]) parameter(0) + gte1 = f32[] get-tuple-element(%param), index=0 + gte2 = f32[10, 10] get-tuple-element(%param), index=1 + ROOT root = (f32[], f32[10, 10]) tuple(%gte1, %gte2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + + DynamicParameterBinding binding; + + TF_EXPECT_OK( + binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}}, + DynamicParameterBinding::DynamicDimension{0, {1}, 0})); + + TF_EXPECT_OK( + binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}}, + DynamicParameterBinding::DynamicDimension{0, {1}, 1})); + + absl::optional param = + binding.GetBinding( + DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0, + /*parameter_index=*/{1}, + /*dimension=*/0}); + + EXPECT_TRUE(param); + EXPECT_EQ(param->parameter_num, 0); + EXPECT_EQ(param->parameter_index, ShapeIndex({0})); + + absl::optional param2 = + binding.GetBinding( + DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0, + /*parameter_index=*/{1}, + /*dimension=*/0}); + EXPECT_TRUE(param2); + EXPECT_EQ(param2->parameter_num, 0); + EXPECT_EQ(param2->parameter_index, ShapeIndex({0})); + + TF_EXPECT_OK(binding.Verify(*module)); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index f98c943669b..6f1f95f2e90 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -22,6 +22,7 @@ limitations under the License. // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" #include "absl/strings/str_cat.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instructions.h" @@ -1671,26 +1672,66 @@ StatusOr ElementalIrEmitter::EmitElementalConcatenate( b_->SetInsertPoint(init_block); + // Assign a unique id for each *different* operand, and count how often each + // operand is used. If all operands are different, the usage count will be 1 + // for each operand. + absl::flat_hash_map to_unique_operand_id; + std::vector operand_usage_count; + for (const auto* operand : hlo->operands()) { + if (to_unique_operand_id.contains(operand)) { + ++operand_usage_count[to_unique_operand_id[operand]]; + } else { + int64 unique_operand_id = to_unique_operand_id.size(); + to_unique_operand_id[operand] = unique_operand_id; + operand_usage_count.push_back(1); + } + } + + // To avoid that we emit the same operand more than once, we create one basic + // block for each *different* operand with a PHI node for the different source + // index inputs. 
+ std::vector emit_operand_blocks( + to_unique_operand_id.size(), nullptr); + std::vector source_index_phis(to_unique_operand_id.size(), + nullptr); + for (const auto* operand : hlo->operands()) { + int64 operand_id = to_unique_operand_id[operand]; + if (emit_operand_blocks[operand_id] != nullptr) { + continue; + } + + emit_operand_blocks[operand_id] = llvm_ir::CreateBasicBlock( + exit_block, StrCat("concat_index_from_operand_id", operand_id), b_); + auto saved_insert_point = b_->GetInsertPoint(); + llvm_ir::SetToFirstInsertPoint(emit_operand_blocks[operand_id], b_); + source_index_phis[operand_id] = + PHI(source_index.GetType(), operand_usage_count[operand_id]); + auto operand_index = source_index; + operand_index[concat_dim] = source_index_phis[operand_id]; + + // Create the terminator of the block before calling operand generators, + // because they require non-degenerate basic blocks. + b_->SetInsertPoint(llvm::BranchInst::Create( + exit_block, /*InsertAtEnd=*/emit_operand_blocks[operand_id])); + TF_ASSIGN_OR_RETURN(llvm::Value * value, + operand_to_generator.at(operand)(operand_index)); + output->addIncoming(value, b_->GetInsertBlock()); + b_->SetInsertPoint(init_block, saved_insert_point); + } + for (int64 operand_idx = 0; operand_idx < hlo->operand_count(); ++operand_idx) { const HloInstruction* operand = hlo->operand(operand_idx); - auto true_block = llvm_ir::CreateBasicBlock( - exit_block, StrCat("concat_index_from_operand", operand_idx), b_); auto false_block = llvm_ir::CreateBasicBlock( exit_block, StrCat("concat_index_not_from_operand", operand_idx), b_); auto concat_dim_size = llvm::ConstantInt::get(source_index[concat_dim]->getType(), operand->shape().dimensions(concat_dim)); - CondBr(ICmpULT(source_index[concat_dim], concat_dim_size), true_block, - false_block); - - // Create the terminator of the true block before calling operand - // generators, because they require non-degenerate basic blocks. - b_->SetInsertPoint( - llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block)); - TF_ASSIGN_OR_RETURN(llvm::Value * value, - operand_to_generator.at(operand)(source_index)); - output->addIncoming(value, b_->GetInsertBlock()); + int64 operand_id = to_unique_operand_id[operand]; + source_index_phis[operand_id]->addIncoming(source_index[concat_dim], + b_->GetInsertBlock()); + CondBr(ICmpULT(source_index[concat_dim], concat_dim_size), + emit_operand_blocks[operand_id], false_block); // Subtract the size of the concat dimension of the current operand // from the source index. 
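The rewritten concatenate emitter above avoids generating code for the same operand twice: every distinct operand receives a unique id and a usage count, one basic block with a PHI node is created per distinct operand, and the usage count sizes the PHI's list of incoming edges. The bookkeeping part can be sketched standalone, with strings standing in for HloInstruction pointers:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Stand-in for hlo->operands(); note that "a" appears twice, like the same
  // HloInstruction feeding two inputs of a concatenate.
  const std::vector<std::string> operands = {"a", "b", "a", "c"};

  std::unordered_map<std::string, int64_t> to_unique_operand_id;
  std::vector<int64_t> operand_usage_count;
  for (const auto& operand : operands) {
    auto it = to_unique_operand_id.find(operand);
    if (it != to_unique_operand_id.end()) {
      ++operand_usage_count[it->second];
    } else {
      const int64_t unique_operand_id = to_unique_operand_id.size();
      to_unique_operand_id[operand] = unique_operand_id;
      operand_usage_count.push_back(1);
    }
  }

  // One emitted block per distinct operand; in the real emitter the usage
  // count determines how many incoming edges the operand's PHI node expects.
  for (const auto& entry : to_unique_operand_id) {
    std::cout << entry.first << " -> id " << entry.second << ", "
              << operand_usage_count[entry.second] << " use(s)\n";
  }
  return 0;
}
```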
@@ -2204,13 +2245,15 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( : iota->shape(); PrimitiveType component_element_type = component_shape.element_type(); llvm::Value* iota_result; - if (ShapeUtil::ElementIsIntegral(component_shape)) { + if (primitive_util::IsIntegralType(component_element_type) || + component_element_type == PRED) { iota_result = b_->CreateIntCast( elem_index_linear, llvm_ir::PrimitiveTypeToIrType(component_element_type, module_), /*isSigned=*/false); } else { - TF_RET_CHECK(ShapeUtil::ElementIsFloating(component_shape)) + TF_RET_CHECK( + primitive_util::IsFloatingPointType(component_element_type)) << component_element_type; llvm::Type* float_ir_type; if (component_element_type == BF16) { diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 45f620f3f33..b34bca55a48 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -61,7 +61,7 @@ struct ExecutionOutput { class Executable { public: explicit Executable( - std::unique_ptr hlo_module, + std::unique_ptr hlo_module, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map) : hlo_module_(std::move(hlo_module)), @@ -162,7 +162,7 @@ class Executable { return hlo_profile_printer_data_ != nullptr; } - const HloModule& module() const { return *hlo_module_; } + HloModule& module() const { return *hlo_module_; } const bool has_module() const { return hlo_module_ != nullptr; } @@ -199,7 +199,7 @@ class Executable { // HloModule this was compiled from. BufferAssignment keeps pointers to // HloInstructions owned by the HloModule so we need to keep the HloModule // around. - const std::unique_ptr hlo_module_; + const std::unique_ptr hlo_module_; // HloSnapshot this was compiled from. Null if not dumping executions. std::unique_ptr hlo_snapshot_; diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index b1629616acd..bfd1b6cb149 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -701,6 +701,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_cse", "//tensorflow/compiler/xla/service:hlo_dce", "//tensorflow/compiler/xla/service:hlo_element_type_converter", + "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/compiler/xla/service:hlo_pass_pipeline", "//tensorflow/compiler/xla/service:hlo_proto", diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc index 4ce877f62a5..e81850db69e 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -77,7 +77,11 @@ bool CanImplementAsCudnnForwardConv(HloInstruction* conv) { return false; } - if (window_util::HasWindowReversal(conv->window())) { + // CuDNN can perform either cross correlation (no reversal), + // or convolution (all dimensions reversed). + if (dnums.input_spatial_dimensions_size() == 2 + ? 
!window_util::AllOrNoneReversed(conv->window()) + : window_util::HasWindowReversal(conv->window())) { return false; } return true; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc index 492d290bf4a..3425e1b4942 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc @@ -138,6 +138,7 @@ Status RunCudnnConvImpl(CudnnConvParams params, const int num_dimensions = window.dimensions_size(); CHECK_LE(num_dimensions, 3); + CHECK_GE(num_dimensions, 1); // cuDNN does not support 1D convolutions. We therefore express 1D // convolutions as 2D convolutions where the first spatial dimension is 1. // This matches the behavior of TF (see definition of conv1d in @@ -148,10 +149,15 @@ Status RunCudnnConvImpl(CudnnConvParams params, output_shape.element_type()) << ShapeUtil::HumanString(output_shape); + // If one dimension is reversed, we need to have all dimensions reversed (so + // we're doing convolution not cross correlation). + const bool dims_reversed = window.dimensions()[0].window_reversal(); + CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size()); CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size()); CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size()); for (const WindowDimension& dim : window.dimensions()) { + CHECK_EQ(dims_reversed, dim.window_reversal()); CHECK_EQ(dim.padding_low(), dim.padding_high()); CHECK_EQ(dim.base_dilation(), 1) << "cudnn does not support base dilation; it " @@ -198,6 +204,7 @@ Status RunCudnnConvImpl(CudnnConvParams params, ConvolutionDescriptor convolution_descriptor(effective_num_dimensions); convolution_descriptor.set_group_count(feature_group_count); + convolution_descriptor.set_convolution_not_crosscorr(dims_reversed); for (int dim = 0; dim < num_dimensions; ++dim) { convolution_descriptor .set_zero_padding( @@ -363,14 +370,12 @@ StatusOr GetCudnnConvParams( params.output_shape = &conv_result_shape; params.fusion.emplace(); auto& fusion = *params.fusion; - if (backend_config.activation_mode() < - static_cast(se::dnn::ActivationMode::kNumActivationModes)) { - fusion.mode = static_cast( - backend_config.activation_mode()); - } else { + if (!se::dnn::ActivationMode_IsValid(backend_config.activation_mode())) { return InternalError("Bad activation mode: %s", backend_config.ShortDebugString()); } + fusion.mode = static_cast( + backend_config.activation_mode()); fusion.side_input_scale = backend_config.side_input_scale(); params.input_buf = operand_buffers[0]; params.filter_buf = operand_buffers[1]; diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index 6dcdaf1cfe0..2ab754a4710 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -161,6 +161,16 @@ StatusOr GpuElementalIrEmitter::EmitFloatBinaryOp( PrimitiveType lhs_input_type = op->operand(0)->shape().element_type(); PrimitiveType rhs_input_type = op->operand(1)->shape().element_type(); PrimitiveType output_type = op->shape().element_type(); + HloOpcode opcode = op->opcode(); + + if (hlo_module_config_.debug_options().xla_gpu_enable_fast_min_max() && + (opcode == HloOpcode::kMaximum || opcode == HloOpcode::kMinimum)) { + return llvm_ir::EmitCallToIntrinsic( + opcode == HloOpcode::kMaximum ? 
llvm::Intrinsic::maxnum + : llvm::Intrinsic::minnum, + {lhs_value, rhs_value}, {lhs_value->getType()}, b_); + } + switch (op->opcode()) { case HloOpcode::kRemainder: { return EmitLibdeviceMathCall("__nv_fmod", {lhs_value, rhs_value}, diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc index 30c1f908896..470457935ac 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc @@ -229,7 +229,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) { return user->opcode() == HloOpcode::kFusion && (user->fusion_kind() == HloInstruction::FusionKind::kLoop || - (user->fusion_kind() == HloInstruction::FusionKind::kInput && + (IsReduceInputFusion(*user) && LayoutsAreReduceInputFusionFriendly(*fusion, *user))); })) { VLOG(3) << "Not merging " << fusion->name() diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 57426327822..ae2e718db29 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -51,7 +51,7 @@ GpuExecutable::GpuExecutable( const string& ptx, const std::vector& cubin, std::pair compute_capability, std::unique_ptr thunk_schedule, - std::unique_ptr hlo_module, + std::unique_ptr hlo_module, std::unique_ptr assignment, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map) diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 0e276282e40..2b3c77f5b82 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -54,7 +54,7 @@ class GpuExecutable : public Executable { GpuExecutable(const string& ptx, const std::vector& cubin, std::pair compute_capability, std::unique_ptr thunk_schedule, - std::unique_ptr hlo_module, + std::unique_ptr hlo_module, std::unique_ptr assignment, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index 2d31fd5570c..452e763a8ea 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -55,7 +55,7 @@ bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer, }); } -bool IsInputFusibleReduction(const HloInstruction& instr) { +bool IsReduceInputFusion(const HloInstruction& instr) { if (instr.IsMultiOutputFusion()) { for (const HloInstruction* operand : instr.fused_expression_root()->operands()) { @@ -67,17 +67,70 @@ bool IsInputFusibleReduction(const HloInstruction& instr) { return true; } } - return false; - } else if (instr.opcode() == HloOpcode::kFusion) { - if (IsReductionToVector(*instr.fused_expression_root())) { - CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput) - << " Fusion rooted at reduction-to-vector op must be of kind kInput: " - << instr.ToString(); - return true; + } else if (instr.opcode() == HloOpcode::kFusion && + IsReductionToVector(*instr.fused_expression_root())) { + CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput) + << " Fusion rooted at reduction-to-vector op must be of kind kInput: " + << instr.ToString(); + return true; + } + return false; +} + +bool 
IsInputFusibleReduction(const HloInstruction& instr) { + return IsReduceInputFusion(instr) || IsReductionToVector(instr); +} + +bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1, + const HloInstruction& instr2) { + // Returns the instructions that determines the emitter used for lowering, + // sometimes referred to as "the real hero". + auto get_real_hero = + [&](const HloInstruction* instr) -> const HloInstruction* { + if (instr->opcode() == HloOpcode::kFusion) { + auto fused_expression_root = instr->fused_expression_root(); + if (instr->IsMultiOutputFusion()) { + // If possible, we want to pick a reduction-to-vector operand of the + // fusion root, because it has the most constraints. + for (const auto* inst : fused_expression_root->operands()) { + if (IsReductionToVector(*inst)) { + return inst; + } + } + return fused_expression_root->operands()[0]; + } + return fused_expression_root; } + return instr; + }; + + // Multi-output fusion kernels share a common parallel loop. The loop + // dimenstions are determined by instruction shapes. + auto get_loop_shape = [&](const HloInstruction* element_instr) { + // Special-case reduction-to-vector ops: The loop dimensions are determined + // by the shape of the first operand. + if (IsReductionToVector(*element_instr)) { + return element_instr->operand(0)->shape(); + } + return element_instr->shape(); + }; + + // All shapes of the root tuple of multi-output fusions should agree, i.e. all + // root ops should have equal output shapes. An exception are + // reduction-to-vector ops. Here the input shapes of the reduction (first + // operand shape) and the reduction dimensions need to match. + auto* instr_1 = get_real_hero(&instr1); + auto* instr_2 = get_real_hero(&instr2); + // TODO(tjoerg): Relax the shape constraint. The datatype does not matter. + if (IsReductionToVector(*instr_1) && IsReductionToVector(*instr_2) && + (!ShapeUtil::Equal(instr_1->shape(), instr_2->shape()) || + instr_1->dimensions() != instr_2->dimensions())) { return false; } - return IsReductionToVector(instr); + // The elementwise output shapes must be the same (including layout). + // TODO(tjoerg): Further relax the constraint. The datatype does not matter. + return ShapeUtil::EqualIgnoringFpPrecision(get_loop_shape(instr_1), + get_loop_shape(instr_2)); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h index f7c24a0d5bb..e9d7ba1c4cf 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h @@ -33,16 +33,29 @@ namespace gpu { bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer, const HloInstruction& reduce); -// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr` -// is either an unfused reduction-to-vector op, an input fusion rooted at a -// reduction-to-vector op, or a multi-output input fusion with at least one -// reduction-to-vector op root. // Note that reduction ops are lowered in different ways. Reduce input fusions // are lowered by IrEmitterUnnested::EmitReductionToVector and must be rooted at // reduction-to-vector ops. Other reduction ops are lowered by // GpuElementalIrEmitter and fused like elementwise ops. + +// Whether `instr` is an input fusion rooted at a reduction-to-vector op or a +// multi-output input fusion with at least one reduction-to-vector op root. 
+bool IsReduceInputFusion(const HloInstruction& instr); + +// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr` +// is either an unfused reduction-to-vector op or a reduce input fusion. bool IsInputFusibleReduction(const HloInstruction& instr); +// Whether instruction shapes are compatible for multi-output fusion, i.e. +// whether the emitters support lowering the resulting fusion. +// This function works for both, sibling and producer-conumser multi-output +// fusion. +// So far, multi-output fusion is supported for loop fusions and reduce +// input fusions only. It is up to the caller to ensure the instructions +// themselves are fusible! +bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1, + const HloInstruction& instr2); + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc index d91b7bc61fd..15d4ee206ce 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc @@ -178,7 +178,7 @@ TEST_F(GpuFusibleTest, EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_ReductionToVector) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( ENTRY entry { c0 = f32[] parameter(0) @@ -191,10 +191,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce); + EXPECT_FALSE(IsReduceInputFusion(*reduce)); EXPECT_TRUE(IsInputFusibleReduction(*reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_ElementalReduction) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( ENTRY entry { c0 = f32[] parameter(0) @@ -207,10 +208,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce); + EXPECT_FALSE(IsReduceInputFusion(*reduce)); EXPECT_FALSE(IsInputFusibleReduction(*reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_SingleOutputInputReduceFusion) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -225,10 +227,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_TRUE(IsReduceInputFusion(*reduce)); EXPECT_TRUE(IsInputFusibleReduction(*reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_SingleOutputLoopReduceFusion) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -243,10 +246,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_FALSE(IsReduceInputFusion(*reduce)); EXPECT_FALSE(IsInputFusibleReduction(*reduce)); } -TEST_F(GpuFusibleTest, 
IsInputFusibleReduction_MultiOutputInputReduceFusion) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_MultiOutputInputReduceFusion) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -263,11 +267,12 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputInputReduceFusion) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_TRUE(IsReduceInputFusion(*reduce)); EXPECT_TRUE(IsInputFusibleReduction(*reduce)); } TEST_F(GpuFusibleTest, - IsInputFusibleReduction_MultiOutputInputReduceFusionWithExtraOutputs) { + IsReduceInputFusion_MultiOutputInputReduceFusionWithExtraOutputs) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -284,10 +289,11 @@ TEST_F(GpuFusibleTest, const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_TRUE(IsReduceInputFusion(*reduce)); EXPECT_TRUE(IsInputFusibleReduction(*reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_MultiOutputLoopReduceFusion) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -304,11 +310,12 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_FALSE(IsReduceInputFusion(*reduce)); EXPECT_FALSE(IsInputFusibleReduction(*reduce)); } TEST_F(GpuFusibleTest, - IsInputFusibleReduction_MultiOutputLoopFusionReduceAndElementwiseOp) { + IsReduceInputFusion_MultiOutputLoopFusionReduceAndElementwiseOp) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -325,8 +332,304 @@ TEST_F(GpuFusibleTest, const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_FALSE(IsReduceInputFusion(*reduce)); EXPECT_FALSE(IsInputFusibleReduction(*reduce)); } +TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_LoopFusions) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[6400]{0} parameter(0) + ROOT mul = f32[6400]{0} multiply(p0.1, p0.1) + } + + fused_computation_2 { + p0.2 = f32[6400]{0} parameter(0) + const.2 = f32[] constant(1) + ROOT div = f32[6400]{0} divide(p0.2, const.2) + } + + ENTRY entry { + p0 = f32[6400]{0} parameter(0) + fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1 + fusion.2 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_2 + ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, fusion.2) + })")) + .ValueOrDie(); + const HloInstruction* fusion_1 = + module->entry_computation()->root_instruction()->operand(0); + const HloInstruction* fusion_2 = + module->entry_computation()->root_instruction()->operand(1); + EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2)); +} + +TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_IgnoreFpPrecision) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[6400]{0} parameter(0) + ROOT mul = f32[6400]{0} multiply(p0.1, p0.1) + } + + fused_computation_2 { + p0.2 = f32[6400]{0} parameter(0) + ROOT convert = f16[6400]{0} convert(p0.2) + } + + 
ENTRY entry { + p0 = f32[6400]{0} parameter(0) + fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1 + fusion.2 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_2 + ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, fusion.2) + })")) + .ValueOrDie(); + const HloInstruction* fusion_1 = + module->entry_computation()->root_instruction()->operand(0); + const HloInstruction* fusion_2 = + module->entry_computation()->root_instruction()->operand(1); + EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2)); +} + +TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_Reduce) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[6400]{0} parameter(0) + ROOT mul = f32[6400]{0} multiply(p0.1, p0.1) + } + + ENTRY entry { + p0 = f32[6400]{0} parameter(0) + fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1 + const.2 = f32[] constant(0) + reduce = f32[] reduce(p0, const.2), dimensions={0}, to_apply=scalar_add + ROOT root = (f32[6400]{0}, f32[]) tuple(fusion.1, reduce) + })")) + .ValueOrDie(); + const HloInstruction* fusion = + module->entry_computation()->root_instruction()->operand(0); + const HloInstruction* reduce = + module->entry_computation()->root_instruction()->operand(1); + EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion, *reduce)); +} + +TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_Elementwise) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[6400]{0} parameter(0) + ROOT mul = f32[6400]{0} multiply(p0.1, p0.1) + } + + ENTRY entry { + p0 = f32[6400]{0} parameter(0) + fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1 + const.2 = f32[] constant(1) + div = f32[6400]{0} divide(p0, const.2) + ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, div) + })")) + .ValueOrDie(); + const HloInstruction* fusion = + module->entry_computation()->root_instruction()->operand(0); + const HloInstruction* div = + module->entry_computation()->root_instruction()->operand(1); + EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion, *div)); +} + +TEST_F(GpuFusibleTest, + ShapesCompatibleForMultiOutputFusion_MultiOutputLoopFusion) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + mul = f32[8,1,5,16,1,1]{5,4,3,2,1,0} multiply(p0.1, p0.1) + exp = f32[8,1,5,16,1,1]{5,4,3,2,1,0} exponential(p0.1) + ROOT tuple = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(mul, exp) + } + + fused_computation_2 { + p0.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + const.2 = f32[] constant(0) + ROOT add = f32[8,1,5,16,1,1]{5,4,3,2,1,0} add(p0.2, const.2) + } + + ENTRY entry { + p0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + fusion.1 = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) fusion(p0), kind=kLoop, calls=fused_computation_1 + fusion.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_2 + gte0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=0 + gte1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=1 + ROOT root = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(gte0, gte1, fusion.2) + })")) + .ValueOrDie(); + const HloInstruction* fusion_1 = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + const 
HloInstruction* fusion_2 = + module->entry_computation()->root_instruction()->operand(1)->operand(0); + EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2)); +} + +TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_UnfusedOps) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + exp = f32[2,2,2]{2,1,0} exponential(p0) + reduce = f32[2,2]{1,0} reduce(exp, c0), dimensions={2}, to_apply=scalar_add + ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce, exp) + })")) + .ValueOrDie(); + const HloInstruction* reduce = + module->entry_computation()->root_instruction()->operand(0); + const HloInstruction* exp = + module->entry_computation()->root_instruction()->operand(1); + EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*reduce, *exp)); +} + +TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_DifferentLayouts) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + p1 = f32[2,2,2]{0,1,2} parameter(1) + c0 = f32[] constant(0) + exp = f32[2,2,2]{2,1,0} exponential(p0) + reduce = f32[2,2]{0,1} reduce(p1, c0), dimensions={2}, to_apply=scalar_add + ROOT root = (f32[2,2]{0,1}, f32[2,2,2]{2,1,0}) tuple(reduce, exp) + })")) + .ValueOrDie(); + const HloInstruction* reduce = + module->entry_computation()->root_instruction()->operand(0); + const HloInstruction* exp = + module->entry_computation()->root_instruction()->operand(1); + EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*reduce, *exp)); +} + +TEST_F(GpuFusibleTest, + ShapesCompatibleForMultiOutputFusion_MultiOutputReduceFusion) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_select { + p1.1 = f32[2,2,2]{2,1,0} parameter(1) + c0 = f32[] constant(0) + broadcast = f32[2,2,2]{2,1,0} broadcast(f32[] c0), dimensions={} + greater-than = pred[2,2,2]{2,1,0} greater-than(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast) + p0.1 = f32[2,2,2]{2,1,0} parameter(0) + ROOT select = f32[2,2,2]{2,1,0} select(pred[2,2,2]{2,1,0} greater-than, f32[2,2,2]{2,1,0} p0.1, f32[2,2,2]{2,1,0} broadcast) + } + + fused_reduce { + p0.2 = f32[2,2,2]{2,1,0} parameter(0) + c1 = f32[] constant(0) + r1 = f32[2,2]{1,0} reduce(p0.2, c1), dimensions={2}, to_apply=scalar_add + mul = f32[2,2,2]{2,1,0} multiply(p0.2, p0.2) + r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={2}, to_apply=scalar_add + ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(r1, r2) + } + + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + p1 = f32[2,2,2]{2,1,0} parameter(1) + select = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_select + fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(select), kind=kInput, calls=fused_reduce + gte0 = f32[2,2]{1,0} get-tuple-element(fusion), index=0 + gte1 = f32[2,2]{1,0} get-tuple-element(fusion), index=1 + ROOT root = (f32[2,2]{1,0}, f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(gte1, gte1, select) + })")) + .ValueOrDie(); + const HloInstruction* fusion_1 = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + const HloInstruction* fusion_2 = + module->entry_computation()->root_instruction()->operand(1)->operand(0); + EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2)); +} + +TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_ReduceFusions) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_reduce_1 { + p0.1 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + ROOT 
reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} p0.1, f32[] c0), dimensions={0}, to_apply=scalar_add + } + + fused_reduce_2 { + p0.2 = f32[2,2,2]{2,1,0} parameter(0) + mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2, f32[2,2,2]{2,1,0} p0.2) + c1 = f32[] constant(0) + ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} mul, f32[] c1), dimensions={0}, to_apply=scalar_add + } + + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + p1 = f32[2,2,2]{2,1,0} parameter(1) + reduce_1 = f32[2,2]{1,0} fusion(p0), kind=kLoop, calls=fused_reduce_1 + reduce_2 = f32[2,2]{1,0} fusion(p1), kind=kLoop, calls=fused_reduce_2 + ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce_1, reduce_2) + })")) + .ValueOrDie(); + const HloInstruction* fusion_1 = + module->entry_computation()->root_instruction()->operand(0); + const HloInstruction* fusion_2 = + module->entry_computation()->root_instruction()->operand(1); + EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2)); +} + +TEST_F(GpuFusibleTest, + ShapesCompatibleForMultiOutputFusion_DifferentReduceDimensions) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_reduce_1 { + p0.1 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} p0.1, f32[] c0), dimensions={0}, to_apply=scalar_add + } + + fused_reduce_2 { + p0.2 = f32[2,2,2]{2,1,0} parameter(0) + mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2, f32[2,2,2]{2,1,0} p0.2) + c1 = f32[] constant(0) + ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} mul, f32[] c1), dimensions={2}, to_apply=scalar_add + } + + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + p1 = f32[2,2,2]{2,1,0} parameter(1) + reduce_1 = f32[2,2]{1,0} fusion(p0), kind=kLoop, calls=fused_reduce_1 + reduce_2 = f32[2,2]{1,0} fusion(p1), kind=kLoop, calls=fused_reduce_2 + ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce_1, reduce_2) + })")) + .ValueOrDie(); + const HloInstruction* fusion_1 = + module->entry_computation()->root_instruction()->operand(0); + const HloInstruction* fusion_2 = + module->entry_computation()->root_instruction()->operand(1); + EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2)); +} + +TEST_F(GpuFusibleTest, + ShapesCompatibleForMultiOutputFusion_NoReductionToVector) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_element_wise { + p0.1 = f32[2,2,2]{2,1,0} parameter(0) + p1.1 = f32[2,2,2]{2,1,0} parameter(1) + ROOT add = f32[2,2,2]{2,1,0} add(p0.1, p1.1) + } + + fused_reduce { + p0.2 = f32[2,2,2]{2,1,0} parameter(0) + mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2, f32[2,2,2]{2,1,0} p0.2) + c1 = f32[] constant(0) + // Note that reduce is not a reduction-to-vector. 
+ ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} mul, f32[] c1), dimensions={1}, to_apply=scalar_add + } + + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + p1 = f32[2,2,2]{2,1,0} parameter(1) + element_wise = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_element_wise + fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(element_wise), kind=kLoop, calls=fused_reduce + ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(fusion, element_wise) + })")) + .ValueOrDie(); + const HloInstruction* fusion_1 = + module->entry_computation()->root_instruction()->operand(0); + const HloInstruction* fusion_2 = + module->entry_computation()->root_instruction()->operand(1); + EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2)); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc index 91609c730b6..1126943624a 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc @@ -37,7 +37,7 @@ class GpuHloOrdering : public PredecessorHloOrdering { public: GpuHloOrdering(const HloModule* module, const StreamAssignment& stream_assignment, - const std::vector& thunk_launch_order); + const std::vector& thunk_launch_order); ~GpuHloOrdering() override = default; // Only the entry computation can possibly be sequentially ordered, and only @@ -56,7 +56,7 @@ class GpuHloOrdering : public PredecessorHloOrdering { GpuHloOrdering::GpuHloOrdering( const HloModule* module, const StreamAssignment& stream_assignment, - const std::vector& thunk_launch_order) + const std::vector& thunk_launch_order) : PredecessorHloOrdering(module) { // The entry computation has a total order when there's only one stream. if (stream_assignment.StreamCount() == 1) { @@ -150,7 +150,7 @@ GpuHloOrdering::GpuHloOrdering( // However, if the total order is A,B,D,C,E, then C and E can run // concurrently. void BFSLaunchOrder(const HloComputation* computation, - std::vector* launch_order) { + std::vector* launch_order) { // This topological sort uses two data structures: // 1. `incoming_edge_count` which keeps track of the number of incoming // edges to each HLO; @@ -158,9 +158,9 @@ void BFSLaunchOrder(const HloComputation* computation, // // The sorting algorithm repeatedly pops the top from the queue and deletes // that HLO from the graph, making more HLOs incoming-edge free. - std::deque queue; + std::deque queue; std::unordered_map incoming_edge_count; - for (const auto& hlo : computation->instructions()) { + for (auto* hlo : computation->instructions()) { if (hlo->operand_count() == 0) { queue.push_back(hlo); } else { @@ -172,10 +172,10 @@ void BFSLaunchOrder(const HloComputation* computation, } while (!queue.empty()) { - const HloInstruction* x = queue.front(); + HloInstruction* x = queue.front(); queue.pop_front(); launch_order->push_back(x); - for (const HloInstruction* y : x->users()) { + for (HloInstruction* y : x->users()) { --incoming_edge_count[y]; if (incoming_edge_count[y] == 0) { queue.push_back(y); @@ -195,14 +195,14 @@ StatusOr> GpuHloSchedule::Build( std::unique_ptr schedule(new GpuHloSchedule); // Initialize thunk_launch_order_, the total order of thunk launches. 
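For illustration only (not part of the patch): BFSLaunchOrder above is a Kahn-style topological sort driven by `incoming_edge_count` and a work queue. A self-contained sketch of the same idea on a plain adjacency list (hypothetical types, not XLA code):

```
#include <cstddef>
#include <deque>
#include <vector>

// Nodes are 0..n-1; users[i] lists the nodes that consume node i's output.
std::vector<int> BfsLaunchOrder(const std::vector<std::vector<int>>& users) {
  const std::size_t n = users.size();
  std::vector<int> incoming(n, 0);
  for (const auto& u : users) {
    for (int v : u) ++incoming[v];
  }

  std::deque<int> queue;  // nodes whose predecessors are all scheduled
  for (std::size_t i = 0; i < n; ++i) {
    if (incoming[i] == 0) queue.push_back(static_cast<int>(i));
  }

  std::vector<int> order;
  while (!queue.empty()) {
    int x = queue.front();
    queue.pop_front();
    order.push_back(x);
    for (int y : users[x]) {
      if (--incoming[y] == 0) queue.push_back(y);  // y became edge-free
    }
  }
  return order;  // a valid launch order; the BFS flavor favors concurrency
}
```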
- const HloComputation* entry_computation = module.entry_computation(); + HloComputation* entry_computation = module.entry_computation(); if (stream_assignment.StreamCount() == 1) { // All kernels are launched on a single stream, so there's no loss of // concurrency by optimizing for minimal memory usage. TF_ASSIGN_OR_RETURN( HloInstructionSequence sequence, ScheduleComputation( - *entry_computation, [pointer_size](const BufferValue& buffer) { + entry_computation, [pointer_size](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size); })); schedule->thunk_launch_order_ = sequence.instructions(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h index 07a7fc67aa5..7f224ffe4f0 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h @@ -46,7 +46,7 @@ class GpuHloSchedule { // Returns the total order of thunk launches, represented in terms of HLO // instructions. - const std::vector& ThunkLaunchOrder() const { + const std::vector& ThunkLaunchOrder() const { return thunk_launch_order_; } @@ -60,7 +60,7 @@ class GpuHloSchedule { private: GpuHloSchedule(); - std::vector thunk_launch_order_; + std::vector thunk_launch_order_; std::unique_ptr hlo_ordering_; }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc index 6d3aed15ebe..91db7151f22 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc @@ -33,7 +33,7 @@ namespace gpu { class GpuHloScheduleTest : public HloTestBase { protected: - using HloVec = std::vector; + using HloVec = std::vector; // Pre-canned shapes. 
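For illustration only (not part of the patch): the lambda handed to ScheduleComputation above just maps a buffer to its size in bytes so the scheduler can minimize peak memory. A deliberately simplified model of that size function (the real ShapeUtil::ByteSizeOf handles many more cases; `ApproxByteSize` is hypothetical):

```
#include <cstdint>

// Arrays cost elements * bytes-per-element; a tuple is charged one pointer
// per element for its index table, which is why pointer_size is threaded
// through. This is a rough model, not the real implementation.
int64_t ApproxByteSize(bool is_tuple, int64_t num_elements,
                       int64_t bytes_per_element, int64_t pointer_size) {
  return is_tuple ? num_elements * pointer_size
                  : num_elements * bytes_per_element;
}

// e.g. an f32[2,2] buffer: ApproxByteSize(false, 4, 4, 8) == 16 bytes.
```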
Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2}); @@ -44,7 +44,7 @@ class GpuHloScheduleTest : public HloTestBase { .ConsumeValueOrDie(); } - std::unique_ptr CreateNewUnverifiedModule() { + std::unique_ptr CreateNewVerifiedModule() { HloModuleConfig config; auto debug_options = GetDebugOptionsForTest(); debug_options.set_xla_gpu_disable_multi_streaming(false); @@ -79,7 +79,7 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) { HloInstruction* dot2 = builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build(dot2)); std::unique_ptr streams = AssignStreams(*module); @@ -139,7 +139,7 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) { HloInstruction* add3 = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, add1, add2)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build(add3)); std::unique_ptr streams = AssignStreams(*module); @@ -209,7 +209,7 @@ TEST_F(GpuHloScheduleTest, ConcurrentMatMul) { HloInstruction* add = builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, dot2)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build(add)); std::unique_ptr streams = AssignStreams(*module); @@ -288,7 +288,7 @@ TEST_F(GpuHloScheduleTest, LatticeMatMul) { HloInstruction* d40 = builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build(d40)); std::unique_ptr streams = AssignStreams(*module); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc index 1c0a23fa3eb..f59da2caa18 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc @@ -65,8 +65,8 @@ HeuristicLayoutAssignment(const HloInstruction* instr, VLOG(2) << "Using heuristic to figure out layouts for " << instr->ToString(); - // Empirically we've found with Volta and cudnn 7 that backward-input convs - // with stride are significantly faster with NCHW layouts. + // Empirically we've found with Volta and cudnn <= 7.3 that backward-input + // convs with stride are significantly faster with NCHW layouts. // // We could have used a mixed layout combination, e.g. (NHWC, NCHW, NCHW), // which on paper gives good performance. However, there are two observations: @@ -75,11 +75,17 @@ HeuristicLayoutAssignment(const HloInstruction* instr, // * we've also observed that for mixed layouts, cuDNN transposes data back // and forth from a different layout combination. If we end up with // transposes anyway, we prefer to have them in XLA, as they can be fused. - // TODO(timshen): Figure out the exact condition. This may be achieved by - // auto-tuning layouts offline. 
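For illustration only (not part of the patch): the replacement code just below gates the NCHW heuristic on the reported cuDNN version, comparing (major, minor) lexicographically via std::tuple. A minimal standalone version of that comparison (the function name is hypothetical):

```
#include <tuple>

// True when the reported cuDNN version is at most 7.3, the range in which
// strided backward-input convolutions were observed to prefer NCHW.
bool CudnnPrefersNchwForStridedBwdInput(int major, int minor) {
  return std::make_tuple(major, minor) <= std::make_tuple(7, 3);
}

// CudnnPrefersNchwForStridedBwdInput(7, 3) -> true
// CudnnPrefersNchwForStridedBwdInput(7, 4) -> false
```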
- if (instr->custom_call_target() == kCudnnConvBackwardInputCallTarget && - window_util::HasStride(instr->window())) { - return kAllNCHW; + if (auto* dnn = stream_executor->AsDnn()) { + auto version_status = dnn->GetVersion(); + if (version_status.ok()) { + auto version = version_status.ConsumeValueOrDie(); + if (std::make_tuple(version.major_version(), version.minor_version()) <= + std::make_tuple(7, 3) && + instr->custom_call_target() == kCudnnConvBackwardInputCallTarget && + window_util::HasStride(instr->window())) { + return kAllNCHW; + } + } } // For other Volta f16 convolutions, use NHWC. diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc index 8cc76c872c6..2ffc8bfb49b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc @@ -61,7 +61,7 @@ TEST_F(LayoutAssignmentTest, Elementwise) { HloInstruction::CreateParameter(1, ashape, "y")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, x, y)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build(add)); @@ -148,7 +148,7 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) { {operand, scale, offset, mean, variance, epsilon, feature_index}, kCudnnBatchNormForwardInferenceCallTarget)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build(batchnorm)); @@ -217,7 +217,7 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) { batchnorm_shape, {operand, scale, offset, epsilon, feature_index}, kCudnnBatchNormForwardTrainingCallTarget)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build(batchnorm)); @@ -298,7 +298,7 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) { feature_index}, kCudnnBatchNormBackwardCallTarget)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build(batchnorm)); diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index 43f43b50e4a..6151dd8ff4c 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -80,7 +80,7 @@ bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) { // This function limits the maximum number of operands to a fusion. // // There's a cap on how many parameters we can pass to a CUDA kernel, but -// exactly what that limit is is hazy, as it depends on (among other things) how +// exactly what that limit is hazy, as it depends on (among other things) how // much GPU constant memory is in use for other purposes. // // Moreover, we don't even know at the point that we're running fusion how many @@ -181,7 +181,8 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, return true; } } else if (consumer->operand_count() == 2 && - consumer->opcode() == HloOpcode::kAdd) { + consumer->opcode() == HloOpcode::kAdd && + consumer->operand(other_operand_index) != producer) { // Fuse a bias add into the output of the dot. 
return true; } diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc index fb77bc4b8eb..688604cd36e 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc @@ -117,7 +117,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfDotUnfused) { auto reshape2 = builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(S32, {1, 1, 1}), dot1)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape2, computation->root_instruction()); EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) @@ -134,7 +134,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) { auto transpose2 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(S32, {1, 1}), dot1, {0, 1})); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(transpose2, computation->root_instruction()); EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) @@ -358,6 +358,29 @@ TEST_F(InstructionFusionTest, DotOutputFusionBiasAdd) { op::Parameter())); } +TEST_F(InstructionFusionTest, + DotOperationFusion_DontOutputFuseDuplicateOperands) { + absl::string_view module_string = R"( +HloModule module + +ENTRY main { + a = f32[50,60]{1,0} parameter(0) + b = f32[60,1]{1,0} parameter(1) + c = f32[50,1]{1,0} dot(a, b), lhs_contracting_dims={1}, rhs_contracting_dims={0} + ROOT d = f32[50,1]{1,0} add(c, c) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool fused_something, + GpuInstructionFusion(/*may_duplicate=*/false).Run(module.get())); + EXPECT_FALSE(fused_something); + EXPECT_THAT(module->entry_computation()->root_instruction(), + Not(op::Fusion())); +} + // Compute sum(1/p0), where p0 has type f32, twice. Check that the division is // duplicated and fused into both reduces. 
TEST_F(InstructionFusionTest, FloatingPointDivIsCheap) { @@ -723,7 +746,7 @@ TEST_F(InstructionFusionTest, AvoidsLargeFusion) { sum = b.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sum, param)); } - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(b.Build()); EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) .Run(module.get()) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 7fcdd805ed3..6693f66d62d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -63,9 +63,6 @@ IrEmitter::IrEmitter(const HloModuleConfig& hlo_module_config, &ir_emitter_context->buffer_assignment(), &b_, module_, is_nested), hlo_module_config_(hlo_module_config) { - b_.setFastMathFlags(llvm_ir::GetFastMathFlags( - /*fast_math_enabled=*/hlo_module_config.debug_options() - .xla_gpu_enable_fast_math())); } Status IrEmitter::DefaultAction(HloInstruction* hlo) { @@ -97,6 +94,18 @@ Status IrEmitter::HandleBitcast(HloInstruction* bitcast) { return Status::OK(); } +Status IrEmitter::HandleAddDependency(HloInstruction* add_dependency) { + VLOG(2) << "HandleAddDependency: " << add_dependency->ToString(); + const HloInstruction* operand = add_dependency->operand(0); + // Add_Dependency is a no-op, but we still want to bind it to an llvm::Value + // sometimes, e.g., when it's operand is a constant or a bitcast of a + // constant. + if (bindings_.BoundToIrValue(*operand)) { + bindings_.BindHloToIrValue(*add_dependency, GetBasePointer(*operand)); + } + return Status::OK(); +} + Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) { auto operand = get_tuple_element->operand(0); CHECK(bindings_.BoundToIrValue(*operand)); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index 56c3f452006..2da46c01693 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -100,6 +100,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, Status HandleBatchNormInference(HloInstruction* batch_norm) override; Status HandleBatchNormTraining(HloInstruction* batch_norm) override; Status HandleBatchNormGrad(HloInstruction* batch_norm) override; + Status HandleAddDependency(HloInstruction* add_dependency) override; Status FinishVisit(HloInstruction* root) override { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 87b6cd640ac..bbe1583c011 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h" #include "absl/algorithm/container.h" -#include "absl/container/inlined_vector.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/types/optional.h" @@ -65,11 +64,11 @@ limitations under the License. 
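For illustration only (not part of the patch): the HandleAddDependency handler added above treats kAddDependency as a pure pass-through: if the data operand already has an IR value, that same value is re-bound under the add-dependency instruction. A hypothetical map-based sketch of the pattern:

```
#include <string>
#include <unordered_map>

// Hypothetical binding table from instruction name to an opaque IR handle.
using Bindings = std::unordered_map<std::string, const void*>;

// An add-dependency style no-op: forward the operand's value, if any.
void BindPassThrough(Bindings& bindings, const std::string& op,
                     const std::string& operand) {
  auto it = bindings.find(operand);
  if (it != bindings.end()) {
    bindings[op] = it->second;  // same underlying value, new name
  }
}
```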
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" -#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h" @@ -88,6 +87,8 @@ limitations under the License. namespace xla { namespace gpu { +using llvm_ir::KernelMappingScheme; + namespace { using absl::InlinedVector; @@ -546,91 +547,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { // TODO(b/112040122): Support variadic reduce. return Unimplemented("Variadic reduce is not supported on GPU"); } - VLOG(3) << "Emitting fused reduction to vector: " << fusion->ToString(); - std::vector> thunks; - absl::Span output_instructions = - root->opcode() == HloOpcode::kTuple - ? root->operands() - : absl::Span(&root, 1); - - // For multi-output fusion emit an initializer for each tuple element. - // Otherwise it's sufficient to just initialize the single output. - HloInstruction* first_reduce = nullptr; - for (int i = 0, e = output_instructions.size(); i != e; ++i) { - if (output_instructions[i]->opcode() == HloOpcode::kReduce) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr initializer_thunk, - BuildInitializerThunk(fusion, output_instructions[i] == root - ? ShapeIndex() - : ShapeIndex({i}))); - thunks.push_back(std::move(initializer_thunk)); - first_reduce = - first_reduce == nullptr ? output_instructions[i] : first_reduce; - } - } - CHECK(first_reduce != nullptr); - std::unique_ptr kernel_thunk = - BuildKernelThunk(fusion, /*implements_whole_instruction=*/false); - GpuElementalIrEmitter elemental_emitter( - hlo_module_config_, ir_emitter_context_->llvm_module(), &b_, - GetNestedComputer()); - FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(fusion), - &elemental_emitter); - TF_RETURN_IF_ERROR(root->Accept(&fused_emitter)); - - // For multi-output fusion CHECK the constraints and feed all the - // reduces into a single loop code generator. Single-output reduce - // fusion is a special case of that. - InlinedVector input_gens; - InlinedVector init_value_gens; - std::vector> - extra_output_gens; - InlinedVector reducers; - InlinedVector reduce_output_shapes; - for (int i = 0, e = output_instructions.size(); i != e; ++i) { - const HloInstruction* inst = output_instructions[i]; - ShapeIndex output_shape_index; - if (root->opcode() == HloOpcode::kTuple) { - output_shape_index = {i}; - } - if (inst->opcode() == HloOpcode::kReduce) { - CHECK(IsReductionToVector(*inst)) - << "Only reductions to vector are supported"; - // Shapes, layouts and dimensions must be the same for all reduces - // inside of this fusion. 
- CHECK(ShapeUtil::Equal(first_reduce->shape(), inst->shape())); - CHECK(ShapeUtil::Equal(first_reduce->operand(0)->shape(), - inst->operand(0)->shape())); - CHECK(ShapeUtil::Equal(first_reduce->operand(1)->shape(), - inst->operand(1)->shape())); - CHECK(first_reduce->dimensions() == inst->dimensions()); - input_gens.push_back(fused_emitter.GetGenerator(inst->operand(0))); - init_value_gens.push_back( - fused_emitter.GetGenerator(inst->operand(1))); - reducers.push_back(inst->to_apply()); - reduce_output_shapes.push_back(std::move(output_shape_index)); - } else { - // For extra outputs we can relax shape equality to allow different - // types (with the same number of elements). Layouts still have to - // match. - CHECK(ShapeUtil::CompatibleIgnoringElementType( - first_reduce->operand(0)->shape(), inst->shape())); - CHECK(LayoutUtil::Equal(first_reduce->operand(0)->shape().layout(), - inst->shape().layout())); - extra_output_gens.emplace_back(fused_emitter.GetGenerator(inst), - std::move(output_shape_index)); - } - } - const Shape& input_shape = first_reduce->operand(0)->shape(); - TF_CHECK_OK(EmitReductionToVector( - kernel_thunk.get(), first_reduce, input_shape, input_gens, - init_value_gens, first_reduce->dimensions(), reducers, - reduce_output_shapes, extra_output_gens)); - thunks.push_back(std::move(kernel_thunk)); - std::unique_ptr sequential_thunk = - absl::make_unique(std::move(thunks), fusion); - AddThunkToThunkSequence(std::move(sequential_thunk)); - return Status::OK(); + return EmitReductionToVector(fusion); } default: LOG(FATAL) << "Bad opcode for input fusion: " @@ -700,13 +617,12 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) { } Status IrEmitterUnnested::EmitExtraOutputsForReduce( - const HloInstruction* reduce, const IrArray::Index& index, + const HloInstruction* unnested_hlo, const IrArray::Index& index, absl::Span> extra_output_gens) { for (int i = 0; i != extra_output_gens.size(); ++i) { - const HloInstruction* output = reduce->parent()->FusionInstruction(); llvm::Value* extra_output_address = - GetIrArray(*output, *output, extra_output_gens[i].second) + GetIrArray(*unnested_hlo, *unnested_hlo, extra_output_gens[i].second) .EmitArrayElementAddress(index, &b_, "extra_output_element_address"); TF_ASSIGN_OR_RETURN(llvm::Value* const extra_output_ir_value, @@ -716,984 +632,13 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce( return Status::OK(); } -Status IrEmitterUnnested::EmitReductionToScalar( - KernelThunk* kernel_thunk, HloInstruction* reduce, const Shape& input_shape, - absl::Span input_gens, - absl::Span init_value_gens, - absl::Span reducers, - absl::Span reduce_output_shapes, - absl::Span> - extra_output_gens) { - // Number of elements processed by a single thread. - constexpr int64 kTileSize = 16; - int64 num_elems = ShapeUtil::ElementsIn(input_shape); - - // Round up the number of tiles to a multiple of the warp size. This is - // necessary for correctness. We launch one thread per tile, and if the - // number of threads isn't a multiple of the number of the warp size, our - // shuffles will read from inactive threads, producing undefined values. 
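For illustration only (not part of the patch): a worked example of the tile rounding discussed above and computed on the next line. With num_elems = 1000 and kTileSize = 16, CeilOfRatio yields 63 tiles, which is rounded up to 64 so the launched threads form whole 32-thread warps (standalone helper definitions shown for clarity):

```
#include <cstdint>

int64_t CeilOfRatio(int64_t a, int64_t b) { return (a + b - 1) / b; }

int64_t RoundUpToNearest(int64_t a, int64_t multiple) {
  return CeilOfRatio(a, multiple) * multiple;
}

// num_elems = 1000, kTileSize = 16, kWarpSize = 32:
//   CeilOfRatio(1000, 16)    == 63
//   RoundUpToNearest(63, 32) == 64 tiles -> 64 threads, exactly 2 warps
int64_t NumTiles(int64_t num_elems, int64_t tile_size, int64_t warp_size) {
  return RoundUpToNearest(CeilOfRatio(num_elems, tile_size), warp_size);
}
```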
- int64 num_tiles = - RoundUpToNearest(CeilOfRatio(num_elems, kTileSize), kWarpSize); - - Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout( - reduce->shape().element_type(), {num_tiles}, {0}); - LaunchDimensions launch_dimensions = CalculateLaunchDimensions( - tiled_input_shape, ir_emitter_context_->device_description()); - - llvm::Type* index_ty = - GetIndexTypeForKernel(reduce, launch_dimensions.launch_bound(), &b_); - - auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { - return llvm::ConstantInt::get(index_ty, c); - }; - - // Check whether every thread will process a full tile's worth of elements - // without reading outside the bounds of the input. If this is true, we can - // skip some bounds checks in the final algorithm. - bool all_threads_in_bounds = num_tiles * kTileSize == num_elems; - - // __global__ void full_reduce_kernel() { - // x_in_tiles = threadIdx.x + blockIdx.x * blockDim.x; - // x = x_in_tiles * kTileSize; - // - // partial_result = init_value; - // if (all_threads_in_bounds || x + kTileSize <= num_elems) { - // for (i = 0; i < kTileSize; ++i) { - // partial_result = Reducer(partial_result, input[x + i]); - // } - // } else { - // for (i = 0; i < kTileSize; ++i) { - // if (x + i < num_elems) { - // partial_result = Reducer(partial_result, input[x + i]); - // } - // } - // } - // for (i = warpSize / 2; i > 0; i /= 2) { - // partial_result = Reducer(partial_result, - // __shfl_down(partial_result, i)); - // } - // if (lane_id == 0) { - // AtomicReducer(&output[y], partial_result); - // } - // } - // - // // Choose num_blocks and threads_per_block such that: - // // - // // num_blocks * threads_per_block = - // // RoundUpToNextMultipleOf(Ceil(num_elems / kTileSize), warpSize), - // // - // // and threads_per_block is a multiple of warpSize. - // reduce_kernel // - auto loop_body_emitter = [=](const IrArray::Index& tile_index) -> Status { - const int num_reduces = reducers.size(); - llvm::Type* element_ir_type = - llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_); - std::vector partial_reduction_result_addresses; - for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result_address = - Alloca(element_ir_type, /*ArraySize=*/nullptr, - "partial_reduction_result." + llvm::Twine(i)); - TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, - init_value_gens[i](IrArray::Index(index_ty))); - Store(init_ir_value, partial_reduction_result_address); - partial_reduction_result_addresses.push_back( - partial_reduction_result_address); - } - - llvm::Value* x_in_tiles = tile_index[0]; - x_in_tiles = ZExtOrTrunc(x_in_tiles, index_ty); - - // Emit an inner for-loop that reduces the elements in the tile. - auto emit_tile_element_loop = [=](bool tile_in_bounds) -> Status { - std::unique_ptr tile_element_loop = - llvm_ir::ForLoop::EmitForLoop( - "element_id_in_tile", index_typed_constant(0), - index_typed_constant(kTileSize), index_typed_constant(1), &b_); - - // Emit the body of the partial reduction loop. - llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(), - &b_); - llvm::Value* x = - NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileSize)), - tile_element_loop->GetIndVarValue()); - // Unless we know the tile is entirely in bounds, we have to emit a - // x-in-bounds check before reading from the input. 
- if (!tile_in_bounds) { - llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - ICmpULT(x, index_typed_constant(num_elems)), "x_in_bounds", &b_); - - // Emit code that reads the input element and accumulates it to - // the partial reduction result. - llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_); - } - - IrArray::Index input_index( - /*linear=*/x, input_shape, &b_); - llvm::Value* input_address = Alloca(element_ir_type); - for (int i = 0; i != num_reduces; ++i) { - TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, - input_gens[i](input_index)); - Store(input_ir_value, input_address); - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *reducers[i], - {partial_reduction_result_addresses[i], input_address}, - partial_reduction_result_addresses[i])); - } - return EmitExtraOutputsForReduce(reduce, input_index, extra_output_gens); - }; - - // x_end = kTileSize + x_in_tiles * kTileSize, i.e., the location that's - // immediately beyond the tile. - llvm::Value* x_end = - NSWAdd(index_typed_constant(kTileSize), - NSWMul(x_in_tiles, index_typed_constant(kTileSize))); - // The tile is entirely in bound if all_threads_in_bounds or - // x_end <= num_elems. - llvm::Value* tile_in_bounds = - Or(ICmpULE(x_end, index_typed_constant(num_elems)), - b_.getInt1(all_threads_in_bounds)); - llvm_ir::LlvmIfData if_tile_in_bounds_data = - llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &b_); - llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block, &b_); - TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/true)); - llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block, &b_); - TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/false)); - - // After the if-then-else statement on tile_in_bounds, emit calls to - // shfl_down that accumulate the partial reduction results of all threads - // from the warp. - llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block, &b_); - int bit_width = llvm_ir::GetSizeInBits(element_ir_type); - // bitcast cannot be applied to aggregate types (even packed ones), so we - // instead bitcast addresses of load/store to intN* of the same bit-width. - llvm::Type* shuffle_ir_type = element_ir_type->isStructTy() - ? b_.getIntNTy(bit_width) - : element_ir_type; - for (int shuffle_distance = kWarpSize / 2; shuffle_distance >= 1; - shuffle_distance /= 2) { - llvm::Value* result_from_other_lane = - Alloca(element_ir_type, nullptr, "result_from_other_lane"); - for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result = - Load(BitCast(partial_reduction_result_addresses[i], - shuffle_ir_type->getPointerTo()), - "partial_reduction_result"); - CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0) - << "Requires block size a multiple of the warp size, otherwise we " - "will read undefined elements."; - Store(EmitFullWarpShuffleDown(partial_reduction_result, - b_.getInt32(shuffle_distance), &b_), - BitCast(result_from_other_lane, shuffle_ir_type->getPointerTo())); - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *reducers[i], - {partial_reduction_result_addresses[i], result_from_other_lane}, - partial_reduction_result_addresses[i])); - } - } - - const HloInstruction* output = - reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce; - - // Emit an atomic operation that accumulates the partial reduction result of - // lane 0 (which holds the partially accumulated result for its warp) to the - // output element. 
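For illustration only (not part of the patch): the shfl_down loop being deleted above is the classic warp-level tree reduction. The plain C++ sketch below simulates what one 32-lane warp computes; the real kernel uses the __shfl_down intrinsic rather than an array, but the data flow is the same.

```
#include <array>
#include <cstddef>

constexpr std::size_t kWarpSize = 32;

// After log2(32) = 5 rounds, lane 0 holds the sum of all 32 lane values.
float WarpReduceSum(std::array<float, kWarpSize> lanes) {
  for (std::size_t distance = kWarpSize / 2; distance >= 1; distance /= 2) {
    for (std::size_t lane = 0; lane + distance < kWarpSize; ++lane) {
      // Models __shfl_down(value, distance): read lane + distance's value.
      lanes[lane] += lanes[lane + distance];
    }
  }
  return lanes[0];  // only lane 0's result is accumulated (atomically) below
}
```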
- llvm::Value* lane_id = - URem(x_in_tiles, index_typed_constant(kWarpSize), "lane_id"); - llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse( - ICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero", &b_); - llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_); - - for (int i = 0; i != num_reduces; ++i) { - llvm::Value* output_address = - GetIrArray(*output, *output, reduce_output_shapes[i]) - .EmitArrayElementAddress( - IrArray::Index( - /*linear=*/b_.getInt64(0), - ShapeUtil::GetSubshape(output->shape(), - reduce_output_shapes[i]), - &b_), - &b_, "output_element_address"); - TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( - *reducers[i], output_address, partial_reduction_result_addresses[i])); - } - return Status::OK(); - }; - - // Emit a parallel loop that iterates through all input tiles, one per thread. - UpdateLaunchDimensions(launch_dimensions, kernel_thunk, - ir_emitter_context_->llvm_module()); - return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape, - launch_dimensions, &b_) - .EmitLoop(IrName(reduce), index_ty); -} - -Status IrEmitterUnnested::EmitColumnReduction( - KernelThunk* kernel_thunk, int64 height, int64 width, - HloInstruction* reduce, const Shape& input_shape, - absl::Span input_gens, - absl::Span init_value_gens, - absl::Span reducers, - absl::Span reduce_output_shapes, - absl::Span> - extra_output_gens) { - // Divide the input matrix into tiles of size KxL. For example, when the - // input matrix is 4x4, K=2, and L=1 the tiled matrix looks like - // - // 0123 - // 0123 - // 4567 - // 4567 // Numbers indicate tile IDs. - // - // Each tile is first partially reduced to a scalar by a thread, and then the - // scalar is accumulated to the output vector using atomic operations. - // - // We choose 128 as the tile size based on empirical evidence. It's big enough - // to reduce the amount of atomic adds in the end, maximizing the memory - // bandwidth. A tile width of 2 allows for high memory bandwidth utilization - // on 16b input data. - constexpr int64 kTileHeight = 128; - constexpr int64 kTileWidth = 2; - - // If the height is not a multiple of kTileHeight, we pad the bottom of the - // input matrix. - const int64 height_in_tiles = CeilOfRatio(height, kTileHeight); - // If width is not a multiple of kTileWidth the rightmost thread will process - // fewer input elements. - const int64 width_in_tiles = CeilOfRatio(width, kTileWidth); - Shape tiled_input_shape = - ShapeUtil::MakeShapeWithLayout(reduce->shape().element_type(), - {height_in_tiles, width_in_tiles}, {1, 0}); - LaunchDimensions launch_dimensions = CalculateLaunchDimensions( - tiled_input_shape, ir_emitter_context_->device_description()); - - // TODO(b/110211620): Convert to use i32 index_type when it is possible. 
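For illustration only (not part of the patch): in the column reduction above, each thread owns one kTileHeight x kTileWidth tile, and its linear thread index is split into tile coordinates exactly as in the pseudocode comment. A hypothetical helper making that mapping explicit:

```
#include <cstdint>
#include <utility>

// Map a thread's linear index onto (y_in_tiles, x_in_tiles) for a matrix
// tiled into height_in_tiles x width_in_tiles tiles, row-major over tiles.
std::pair<int64_t, int64_t> TileCoords(int64_t linear_index,
                                       int64_t width_in_tiles) {
  return {linear_index / width_in_tiles, linear_index % width_in_tiles};
}

// e.g. width_in_tiles = 4: linear_index 6 -> tile row 1, tile column 2.
```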
- llvm::Type* index_ty = b_.getInt64Ty(); - - auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { - return llvm::ConstantInt::get(index_ty, c); - }; - - // for (linear_index = threadIdx.x + blockIdx.x * blockDim.x; - // linear_index < height_in_tiles * width_in_tiles; - // linear_index += blockDim.x * gridDim.x) { - // y_in_tiles = linear_index / width_in_tiles; - // x_in_tiles = linear_index % width_in_tiles; - // - // partial_results[kTileWidth] = init_values; - // tile_in_y_bounds = height % kTileHeight == 0 || - // y_in_tiles * kTileHeight + kTileHeight <= height; - // tile_in_x_bounds = width % kTileWidth == 0 || - // x_in_tiles * kTileWidth + kTileWidth <= width; - // // The implementation handles y and x bound checks separately. - // if (tile_in_y_bounds && tile_in_x_bounds) { - // for (y_offset : range(kTileHeight)) { - // y = y_in_tiles * kTileHeight + y_offset; - // for (x_offset : range(kTileWidth)) { - // x = x_in_tiles * kTileWidth + x_offset; - // partial_result = Reducer(partial_result[x_offset], input[y][x]); - // } - // } - // } else { - // for (y_offset : range(kTileHeight)) { - // y = y_in_tiles * kTileHeight + y_offset; - // for (y_offset : range(kTileHeight)) { - // x = x_in_tiles * kTileWidth + x_offset; - // if (y < height && x < width) { - // partial_result = Reducer(partial_result, input[y][x]); - // } - // } - // } - // } - // for (x_offset : range(kTileWidth)) { - // AtomicReducer(&output[x + x_offset], partial_result[x_offset]); - // } - // } - auto loop_body_emitter = [=](const IrArray::Index& tile_index) -> Status { - const int num_reduces = reducers.size(); - // Emit the loop body that reduces one tile. - llvm::Type* element_ir_type = - llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_); - std::vector partial_reduction_result_addresses; - for (int i = 0; i != num_reduces; ++i) { - for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) { - llvm::Value* partial_reduction_result_address = - Alloca(element_ir_type, /*ArraySize=*/nullptr, - "partial_reduction_result." + - llvm::Twine(i * kTileWidth + x_offset)); - TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, - init_value_gens[i](IrArray::Index(index_ty))); - Store(init_ir_value, partial_reduction_result_address); - partial_reduction_result_addresses.push_back( - partial_reduction_result_address); - } - } - - // Emit an inner for-loop that partially reduces the elements in the given - // tile. - llvm::Value* y_in_tiles = tile_index[0]; - llvm::Value* x_in_tiles = tile_index[1]; - - y_in_tiles = ZExtOrTrunc(y_in_tiles, index_ty); - x_in_tiles = ZExtOrTrunc(x_in_tiles, index_ty); - - auto emit_tile_element_loop = [=](bool tile_in_y_bounds, - bool tile_in_x_bounds) -> Status { - std::unique_ptr tile_element_loop = - llvm_ir::ForLoop::EmitForLoop( - "element_id_in_tile", index_typed_constant(0), - index_typed_constant(kTileHeight), index_typed_constant(1), &b_); - - // Emit the body of the partial reduction loop. - llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(), - &b_); - llvm::Value* y = - NSWAdd(NSWMul(y_in_tiles, index_typed_constant(kTileHeight)), - tile_element_loop->GetIndVarValue()); - - // Unless we know that y is in bounds, we have to emit a check before - // reading from the input. - if (!tile_in_y_bounds) { - llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - ICmpULT(y, index_typed_constant(height)), "y_in_bounds", &b_); - - // Emit code that reads the input element and accumulates it to - // the partial reduction result. 
- llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_); - } - for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) { - llvm::Value* x = - NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileWidth)), - index_typed_constant(x_offset)); - // Unless we know that x is in bounds, we have to emit a check before - // reading from the input. - if (!tile_in_x_bounds) { - llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - ICmpULT(x, index_typed_constant(width)), "x_in_bounds", &b_); - llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_); - } - llvm::Value* input_address = Alloca(element_ir_type); - // {y,x} is an index to input_matrix_shape [height,width]. We need to - // convert that to an index to input_shape (the shape of the operand of - // "reduce"). This conversion is composed of a transposition from - // input_shape to normalized_input_shape and a reshape from - // normalized_input_shape to input_matrix_shape. - const Shape normalized_input_shape = - ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( - input_shape); - auto input_shape_min2maj = LayoutUtil::MinorToMajor(input_shape); - const std::vector transpose_dimension_mapping( - input_shape_min2maj.rbegin(), input_shape_min2maj.rend()); - - const Shape input_matrix_shape = - ShapeUtil::MakeShapeWithDescendingLayout(input_shape.element_type(), - {height, width}); - const IrArray::Index input_matrix_index({y, x}, input_matrix_shape, - &b_); - const IrArray::Index input_index = - input_matrix_index - .SourceIndexOfReshape(input_matrix_shape, - normalized_input_shape, &b_) - .SourceIndexOfTranspose(normalized_input_shape, input_shape, - transpose_dimension_mapping, &b_); - for (int i = 0; i != num_reduces; ++i) { - TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, - input_gens[i](input_index)); - Store(input_ir_value, input_address); - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *reducers[i], - {partial_reduction_result_addresses[i * kTileWidth + x_offset], - input_address}, - partial_reduction_result_addresses[i * kTileWidth + x_offset])); - TF_RETURN_IF_ERROR(EmitExtraOutputsForReduce(reduce, input_index, - extra_output_gens)); - } - } - return Status::OK(); - }; - - // y_end = kTileHeight + y_in_tiles * kTileHeight, i.e., the y location - // that's immediately beyond the tile. - llvm::Value* y_end = - NSWAdd(index_typed_constant(kTileHeight), - NSWMul(y_in_tiles, index_typed_constant(kTileHeight))); - // x_end = kTileWidth + x_in_tiles * kTileWidth, i.e., the x location - // that's immediately beyond the tile. - llvm::Value* x_end = - NSWAdd(index_typed_constant(kTileWidth), - NSWMul(x_in_tiles, index_typed_constant(kTileWidth))); - llvm::Value* tile_in_y_bounds = - Or(ICmpULE(y_end, index_typed_constant(height)), - b_.getInt1(height % kTileHeight == 0)); - llvm::Value* tile_in_x_bounds = - Or(ICmpULE(x_end, index_typed_constant(width)), - b_.getInt1(width % kTileWidth == 0)); - // The tile is in y bounds if "height" is a multiple of kTileHeight or - // y_end <= height. - llvm_ir::LlvmIfData if_tile_in_y_bounds_data = - llvm_ir::EmitIfThenElse(tile_in_y_bounds, "tile_in_y_bounds", &b_); - llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.true_block, &b_); - // The tile is in x bounds if "width" is a multiple of kTileWidth or - // x_end <= width. 
- llvm_ir::LlvmIfData if_tile_in_x_bounds_data = - llvm_ir::EmitIfThenElse(tile_in_x_bounds, "tile_in_x_bounds", &b_); - llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block, &b_); - TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/true, - /*tile_in_x_bounds=*/true)); - llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block, &b_); - TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/true, - /*tile_in_x_bounds=*/false)); - llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.false_block, &b_); - if_tile_in_x_bounds_data = - llvm_ir::EmitIfThenElse(tile_in_x_bounds, "tile_in_x_bounds", &b_); - llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block, &b_); - TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/false, - /*tile_in_x_bounds=*/true)); - llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block, &b_); - TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/false, - /*tile_in_x_bounds=*/false)); - - // After the nested if-then-else statement on tile_in_y_bounds and - // tile_in_x_bounds, emit atomic operations to accumulate the partial - // reduction result to the output element. - llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.after_block, &b_); - const HloInstruction* output = - reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce; - for (int i = 0; i != num_reduces; ++i) { - for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) { - llvm::Value* x = - NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileWidth)), - index_typed_constant(x_offset)); - llvm::Value* output_address = - GetIrArray(*output, *output, reduce_output_shapes[i]) - .EmitArrayElementAddress( - IrArray::Index( - x, - ShapeUtil::GetSubshape(output->shape(), - reduce_output_shapes[i]), - &b_), - &b_, "output_element_address"); - TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( - *reducers[i], output_address, - partial_reduction_result_addresses[i * kTileWidth + x_offset])); - } - } - return Status::OK(); - }; - - // Emit a parallel loop that iterate through all input tiles. - UpdateLaunchDimensions(launch_dimensions, kernel_thunk, - ir_emitter_context_->llvm_module()); - return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape, - launch_dimensions, &b_) - .EmitLoop(IrName(reduce), index_ty); -} - -static std::pair ComputeTilingSchemeForReduction( - int64 depth, int64 width, int64 kWarpSize) { - constexpr int64 kTargetNumElementsPerThread = 64; - int64 x_tile_size = kTargetNumElementsPerThread; - int64 z_tile_size = 1; - - // Only tile along the x dimension with tile size kTargetNumElementsPerThread - // if doing so doesn't require a slow version of loop with bound check on each - // dimension. A more sophisticated heuristics is to enable tile along the - // x dimension with tile size kTargetNumElementsPerThread when either width is - // a factor of (kWarpSize * kTargetNumElementsPerThread) or width is big - // enough so that only a small fraction of the threads execute the slow - // version of loop with bound check. 
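For illustration only (not part of the patch): the heuristic described above can be restated as a standalone function (same constants as the deleted ComputeTilingSchemeForReduction, hypothetical free-function form): keep the large 64-element x-tile only when the row width divides evenly into warpSize * 64 chunks, otherwise fall back to an 8x8 tile and shrink the z-tile until it divides depth.

```
#include <cstdint>
#include <utility>

// Returns {x_tile_size, z_tile_size}.
std::pair<int64_t, int64_t> TilingSchemeForRowReduction(int64_t depth,
                                                        int64_t width,
                                                        int64_t warp_size) {
  const int64_t kTargetNumElementsPerThread = 64;
  int64_t x_tile_size = kTargetNumElementsPerThread;
  int64_t z_tile_size = 1;
  if (width % (warp_size * kTargetNumElementsPerThread) != 0) {
    x_tile_size = 8;
    z_tile_size = 8;
    while (depth % z_tile_size != 0) --z_tile_size;  // largest divisor <= 8
  }
  return {x_tile_size, z_tile_size};
}

// e.g. depth = 12, width = 1000, warp_size = 32:
//   1000 % 2048 != 0, so x_tile_size = 8 and z_tile_size shrinks to 6.
```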
- if (width % (kWarpSize * kTargetNumElementsPerThread) != 0) { - x_tile_size = 8; - z_tile_size = 8; - while (depth % z_tile_size != 0) { - z_tile_size -= 1; - } - } - - return std::pair(x_tile_size, z_tile_size); -} - -Status IrEmitterUnnested::EmitRowReduction( - KernelThunk* kernel_thunk, int64 depth, int64 height, int64 width, - HloInstruction* reduce, const Shape& input_shape, - absl::Span input_gens, - absl::Span init_value_gens, - absl::Span reducers, - absl::Span reduce_output_shapes, - absl::Span> - extra_output_gens) { - // A naive algorithm is: - // 1. Divide the x dimension of the input tensor into tiles of size 1x1xX. - // 2. Partially reduces each tile to a scalar using one thread. - // 3. Accumulates that scalar to the output vector using atomic operations. - // - // for (linear_index = threadIdx.x + blockIdx.x * blockDim.x; - // linear_index < depth * height * width_in_tiles; - // linear_index += blockDim.x * gridDim.x) { - // int x_in_tiles = linear_index % width_in_tiles; - // int y = linear_index / width_in_tiles % height; - // int z = linear_index / (height * width_in_tiles); - // float partial_result = 0; - // for (element_id_in_tile : range(x_tile_size)) { - // int x = x_in_tiles * x_tile_size + element_id_in_tile; - // if (x < width) - // partial_result = reducer(partial_result, input[z][y][x]); - // } - // AtomicReducer(&output[y], partial_result); - // } - // - // Four optimizations are performed. - // - // 1. To coalesce global memory accesses, dilate the tile with a factor of 32 - // (i.e. the warp size). For example, suppose the width is 8x32=256. Instead - // of making each tile consecutive, we let make tile 0 column - // [0,32,64,...,224], tile 1 column [1,33,65,...,225], and so on. This ensures - // that threads in a warp access consecutive memory in one iteration (i.e. - // coalesced). In the above example, the warp that contains thread 0-31 - // accesses column 0-31 in the first iteration, and 32-63 in the second - // iteration, and so on. - // - // 2. Partially accumulate partial reduced results computed by threads in the - // same warp using shfl_down. Using shfl_down is faster than directly using - // atomic operations because shfl_down transfers the data between threads - // using shared memory and threads in the same warp run in lock step (thus no - // extra synchronization needed). See - // https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ - // for details. The downside is, to produce correct results when using - // shfl_down, we need to guarantee threads in the same warp work on input - // elements with the same y, so the number of tiles in each row must be a - // multiple of 32. - // - // 3. Specialize the case that the entire tile is in bounds. When that is - // true, we don't need to emit "if(x 0; shuffle_distance /= 2) - // partial_result = Reducer( - // partial_result, - // __shfl_down_sync(CUDA_WARP_ALL, partial_result, shuffle_distance)); - // if (lane_id == 0) - // AtomicReducer(&output[y], partial_result); - // } - // - - int64 x_tile_size; - int64 z_tile_size; - std::tie(x_tile_size, z_tile_size) = - ComputeTilingSchemeForReduction(depth, width, kWarpSize); - - // Round the width in tiles up to the nearest multiple of kWarpSize, so that - // the use of shfl_down is valid. 
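For illustration only (not part of the patch): the coalescing scheme described in optimization 1 above dilates each thread's x-tile by the warp size, so that in any one loop iteration the 32 lanes of a warp touch 32 consecutive elements. A small helper reproducing the x-index formula used by the emitted loop below:

```
#include <cstdint>

// x-coordinate read by a lane in one iteration over its x-tile. For a fixed
// element_id, lanes 0..31 of a warp read consecutive x values, so the loads
// coalesce into a single memory transaction.
int64_t DilatedX(int64_t lane_id, int64_t warp_id, int64_t element_id,
                 int64_t x_tile_size, int64_t warp_size) {
  return lane_id + warp_size * (element_id + warp_id * x_tile_size);
}

// e.g. warp_size = 32, x_tile_size = 8, warp_id = 0:
//   element_id 0 -> lanes cover x =  0..31
//   element_id 1 -> lanes cover x = 32..63
```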
- const int64 width_in_tiles = - RoundUpToNearest(CeilOfRatio(width, x_tile_size), kWarpSize); - Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout( - reduce->shape().element_type(), - {depth / z_tile_size, height, width_in_tiles}, {2, 1, 0}); - LaunchDimensions launch_dimensions = CalculateLaunchDimensions( - tiled_input_shape, ir_emitter_context_->device_description()); - llvm::Type* index_ty = - GetIndexTypeForKernel(reduce, launch_dimensions.launch_bound(), &b_); - - auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { - return llvm::ConstantInt::get(index_ty, c); - }; - - auto loop_body_emitter = [=](const IrArray::Index& tile_index) { - const int num_reduces = reducers.size(); - llvm::Type* element_ir_type = llvm_ir::PrimitiveTypeToIrType( - input_shape.element_type(), ir_emitter_context_->llvm_module()); - std::vector partial_reduction_result_addresses; - for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result_address = - Alloca(element_ir_type, /*ArraySize=*/nullptr, - "partial_reduction_result." + llvm::Twine(i)); - TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, - init_value_gens[i](IrArray::Index(index_ty))); - Store(init_ir_value, partial_reduction_result_address); - partial_reduction_result_addresses.push_back( - partial_reduction_result_address); - } - - llvm::Value* z_tile = tile_index[0]; - llvm::Value* y = tile_index[1]; - llvm::Value* x_tile = tile_index[2]; - - x_tile = ZExtOrTrunc(x_tile, index_ty); - - llvm::Value* warp_id = - UDiv(x_tile, index_typed_constant(kWarpSize), "warp_id"); - llvm::Value* lane_id = - URem(x_tile, index_typed_constant(kWarpSize), "lane_id"); - - // The x-location of the last element in this z-x-tile. - // last_x = lane_id + warpSize * (x_tile_size - 1 + warp_id * x_tile_size); - llvm::Value* last_x = NSWAdd( - lane_id, - NSWMul(index_typed_constant(kWarpSize), - NSWAdd(index_typed_constant(x_tile_size - 1), - NSWMul(warp_id, index_typed_constant(x_tile_size))))); - - KernelSupportLibrary ksl( - &b_, - /*unroll_mode=*/xla::llvm_ir::UnrollMode::kFullyUnroll, - /*prevent_vectorization=*/false); - - // Emit a for-loop that partially reduces the elements in the given - // z-x-tile. - auto emit_z_x_tile_element_loop = [&](bool x_tile_in_bounds, - int64 x_tile_loop_bound) -> Status { - auto emit_z_tile_element_loop = [&](llvm::Value* z_indvar) -> Status { - llvm::Value* z = - NSWAdd(z_indvar, NSWMul(index_typed_constant(z_tile_size), z_tile)); - TF_RETURN_IF_ERROR(ksl.For( - "x_tile", - /*start=*/index_typed_constant(0), - /*end=*/index_typed_constant(x_tile_loop_bound), - /*step=*/1, [&](llvm::Value* x_indvar) -> Status { - // x = lane_id + - // warpSize * (element_id_in_x_tile + warp_id * x_tile_size); - llvm::Value* x = NSWAdd( - lane_id, - NSWMul(index_typed_constant(kWarpSize), - NSWAdd(x_indvar, - NSWMul(warp_id, llvm::ConstantInt::get( - index_ty, x_tile_size))))); - - // Unless we know the x-tile is entirely in bounds, we have to - // emit a x-in-bounds check before reading from the input. - if (!x_tile_in_bounds) { - llvm_ir::LlvmIfData if_x_in_bounds_data = - llvm_ir::EmitIfThenElse( - ICmpULT(x, index_typed_constant(width)), "x_in_bounds", - &b_); - // Points b_ to the then-block. - llvm_ir::SetToFirstInsertPoint(if_x_in_bounds_data.true_block, - &b_); - } - - // Emit code that reads the input element and accumulates it - // to the partial reduction result. - llvm::Value* input_address = Alloca(element_ir_type); - { - // {z,y,x} is an index to input_3d_tensor_shape - // [depth,height,width]. 
We need to convert that to an index - // to input_shape (the shape of the operand of "reduce"). - // This conversion is composed of a transposition from - // input_shape to normalized_input_shape and a reshape from - // normalized_input_shape to input_3d_tensor_shape. - const Shape normalized_input_shape = ShapeUtil:: - MakeShapeWithDescendingLayoutAndSamePhysicalLayout( - input_shape); - auto input_shape_min2maj = - LayoutUtil::MinorToMajor(input_shape); - const std::vector transpose_dimension_mapping( - input_shape_min2maj.rbegin(), input_shape_min2maj.rend()); - const Shape input_3d_tensor_shape = - ShapeUtil::MakeShapeWithDescendingLayout( - input_shape.element_type(), {depth, height, width}); - const IrArray::Index input_3d_tensor_index( - {z, y, x}, input_3d_tensor_shape, &b_); - const IrArray::Index input_index = - input_3d_tensor_index - .SourceIndexOfReshape(input_3d_tensor_shape, - normalized_input_shape, &b_) - .SourceIndexOfTranspose( - normalized_input_shape, input_shape, - transpose_dimension_mapping, &b_); - - for (int i = 0; i != num_reduces; ++i) { - TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, - input_gens[i](input_index)); - Store(input_ir_value, input_address); - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *reducers[i], - {partial_reduction_result_addresses[i], input_address}, - partial_reduction_result_addresses[i])); - } - return EmitExtraOutputsForReduce(reduce, input_index, - extra_output_gens); - } - })); - return Status::OK(); - }; - - return ksl.For("z_tile", - /*start=*/index_typed_constant(0), - /*end=*/index_typed_constant(z_tile_size), - /*step=*/1, emit_z_tile_element_loop); - }; - - llvm::Value* tile_in_bounds = - Or(b_.getInt1(width % (x_tile_size * kWarpSize) == 0), - ICmpULT(last_x, index_typed_constant(width))); - - TF_RETURN_IF_ERROR( - ksl.If(tile_in_bounds, - /*true_block_generator=*/ - [&]() -> Status { - return emit_z_x_tile_element_loop(/*x_tile_in_bounds=*/true, - x_tile_size); - }, - /*false_block_generator=*/ - [&]() -> Status { - return emit_z_x_tile_element_loop( - /*x_tile_in_bounds=*/false, - CeilOfRatio(width % (x_tile_size * kWarpSize), kWarpSize)); - })); - - // After accumulating the elements of the z_x_tile, emit calls to - // shfl_down that accumulate the partial reduction results of all - // threads in a warp. - int bit_width = llvm_ir::GetSizeInBits(element_ir_type); - // bitcast cannot be applied to aggregate types (even packed ones), so we - // instead bitcast addresses of load/store to intN* of the same bit-width. - llvm::Type* shuffle_ir_type = element_ir_type->isStructTy() - ? 
b_.getIntNTy(bit_width) - : element_ir_type; - for (int shuffle_distance = 16; shuffle_distance >= 1; - shuffle_distance /= 2) { - llvm::Value* result_from_other_lane = - Alloca(element_ir_type, nullptr, "result_from_other_lane"); - for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result = - Load(BitCast(partial_reduction_result_addresses[i], - shuffle_ir_type->getPointerTo()), - "partial_reduction_result"); - CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0) - << "Requires block size a multiple of the warp size, otherwise we " - "will read undefined elements."; - Store(EmitFullWarpShuffleDown(partial_reduction_result, - b_.getInt32(shuffle_distance), &b_), - BitCast(result_from_other_lane, shuffle_ir_type->getPointerTo())); - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *reducers[i], - {partial_reduction_result_addresses[i], result_from_other_lane}, - partial_reduction_result_addresses[i])); - } - } - - const HloInstruction* output = - reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce; - - // Emit an atomic operation that accumulates the partial reduction result of - // lane 0 (which holds the partially accumulated result for its warp) to the - // output element. - llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse( - ICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero", &b_); - llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_); - for (int i = 0; i != num_reduces; ++i) { - llvm::Value* output_address = - GetIrArray(*output, *output, reduce_output_shapes[i]) - .EmitArrayElementAddress( - IrArray::Index(y, - ShapeUtil::GetSubshape( - output->shape(), reduce_output_shapes[i]), - &b_), - &b_, "output_element_address"); - // We don't need to emit atomic operations if there is only one tile of - // results. 'depth' is the z dimension, 'width' is the x dimension. - if (z_tile_size >= depth && x_tile_size >= width) { - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *reducers[i], - {output_address, partial_reduction_result_addresses[i]}, - output_address)); - } else { - TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( - *reducers[i], output_address, - partial_reduction_result_addresses[i])); - } - } - return Status::OK(); - }; - - // Emit a parallel loop that iterates through every input tiles. - UpdateLaunchDimensions(launch_dimensions, kernel_thunk, - ir_emitter_context_->llvm_module()); - return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape, - launch_dimensions, &b_) - .EmitLoop(IrName(reduce), index_ty); -} - -// Figures out whether `reduce` is a row or column reduction, and which -// dimensions to reduce, and calls either `EmitRowReduction` or -// `EmitColumnReduction` as appropriate. -// Prerequisite: all the dimensions to keep are contiguous in the input layout -// and, if `reduce` is fused, the fused subgraph is pure -// elementwise. -Status IrEmitterUnnested::EmitReductionToVector( - KernelThunk* kernel_thunk, HloInstruction* reduce, const Shape& input_shape, - absl::Span input_gens, - absl::Span init_value_gens, - absl::Span dimensions_to_reduce, - absl::Span reducers, - absl::Span reduce_output_shapes, - absl::Span> - extra_output_gens) { - // This emission requires "reduce" to have an input layout. It is either set - // by LayoutAssignment (for a top-level kReduce) or by InstructionFusion (for - // a fused kReduce). 
- CHECK(input_shape.has_layout()) << "LayoutAssignment or InstructionFusion " - "doesn't set the input layout of " - << reduce->ToString(); - - // Specialize multi-dimensional-array-to-vector reduction. - std::vector input_dims_to_keep; - for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape); - ++input_dim) { - if (std::find(dimensions_to_reduce.begin(), dimensions_to_reduce.end(), - input_dim) == dimensions_to_reduce.end()) { - input_dims_to_keep.push_back(input_dim); - } - } - - // Sort the dimensions to keep from minor to major, to facilitate checking - // whether another dimension is major or minor of them. - std::sort(input_dims_to_keep.begin(), input_dims_to_keep.end(), - [&input_shape](int64 dim_a, int64 dim_b) { - return PositionInContainer(LayoutUtil::MinorToMajor(input_shape), - dim_a) < - PositionInContainer(LayoutUtil::MinorToMajor(input_shape), - dim_b); - }); - // Now, if output rank is at least 1, `input_dims_to_keep.front()` is - // minormost and `input_dims_to_keep.back()` is majormost. - - // If the dimensions to keep are minormost, emit a column reduction. As all - // the dimensions to keep are contiguous, by prerequisite of - // `EmitReductionToVector`, we only need to check whether the minormost - // dimension of the input is to keep. - if (ShapeUtil::IsEffectiveScalar(reduce->shape())) { - return EmitReductionToScalar(kernel_thunk, reduce, input_shape, input_gens, - init_value_gens, reducers, - reduce_output_shapes, extra_output_gens); - } else if (input_dims_to_keep.front() == - LayoutUtil::Minor(input_shape.layout(), 0)) { - // Column reduction. Treat the result of "input" as a matrix whose width - // is the most minor dimension and height the product of other dimensions, - // and treat "reduce" as a column reduction of the input matrix. - const int64 width = ShapeUtil::ElementsIn(reduce->shape()); - // "width" can be zero, so don't do - // height = ShapeUtil::ElementsIn(input_shape) / width; - int64 height = 1; - for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape); - ++input_dim) { - if (!std::count(input_dims_to_keep.begin(), input_dims_to_keep.end(), - input_dim)) { - height *= input_shape.dimensions(input_dim); - } - } - return EmitColumnReduction(kernel_thunk, height, width, reduce, input_shape, - input_gens, init_value_gens, reducers, - reduce_output_shapes, extra_output_gens); - } else { - // Reduce the row dimension of a matrix or reduce dimension 0 and 2 in a - // 3D tensor. The size of dimension 1 (the height) is the size of the - // dimension to keep, the size of dimension 0 (the depth) is the product - // of dimensions that are more major than the dimension to keep, and the - // size of dimension 2 (the width) is the product of more minor - // dimensions. 
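To make the row/column classification and the depth/height/width bookkeeping just described concrete, here is a toy host-side sketch; the struct and function names are invented for illustration and this is not the XLA helper itself.

```cuda
#include <cstdio>
#include <vector>

// Given the input dimension sizes in minor-to-major order and the positions
// (in that order) of the contiguous dimensions to keep, compute the
// (depth, height, width) view that a row reduction operates on.
struct RowReductionView { long long depth, height, width; };

RowReductionView ClassifyRowReduction(
    const std::vector<long long>& sizes_minor_to_major, int kept_begin,
    int kept_end /*exclusive*/) {
  RowReductionView v{1, 1, 1};
  for (int pos = 0; pos < (int)sizes_minor_to_major.size(); ++pos) {
    if (pos < kept_begin) v.width *= sizes_minor_to_major[pos];      // more minor
    else if (pos < kept_end) v.height *= sizes_minor_to_major[pos];  // kept
    else v.depth *= sizes_minor_to_major[pos];                       // more major
  }
  return v;
}

int main() {
  // f32[2, 3, 4, 5] with row-major layout => sizes {5, 4, 3, 2} minor to major.
  // Keep dimension 2 (size 4), i.e. position 1 in minor-to-major order; since
  // the kept dimension is not the minormost one, this is a row reduction.
  RowReductionView v = ClassifyRowReduction({5, 4, 3, 2}, /*kept_begin=*/1,
                                            /*kept_end=*/2);
  // Prints depth=6 height=4 width=5, the [depth, height, width] view used by
  // EmitRowReduction.  Keeping the minormost dimension instead would fall
  // into the column-reduction branch.
  std::printf("depth=%lld height=%lld width=%lld\n", v.depth, v.height,
              v.width);
  return 0;
}
```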
- int64 depth = 1; - int64 width = 1; - for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape); - ++input_dim) { - if (PositionInContainer(LayoutUtil::MinorToMajor(input_shape), - input_dim) > - PositionInContainer(LayoutUtil::MinorToMajor(input_shape), - input_dims_to_keep.back())) { - depth *= input_shape.dimensions(input_dim); - } else if (PositionInContainer(LayoutUtil::MinorToMajor(input_shape), - input_dim) < - PositionInContainer(LayoutUtil::MinorToMajor(input_shape), - input_dims_to_keep.front())) { - width *= input_shape.dimensions(input_dim); - } - } - const int64 height = ShapeUtil::ElementsIn(reduce->shape()); - return EmitRowReduction(kernel_thunk, depth, height, width, reduce, - input_shape, input_gens, init_value_gens, reducers, - reduce_output_shapes, extra_output_gens); - } -} - Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) { // TODO(b/112040122): Support multi-output reduce. if (!ShapeUtil::IsArray(reduce->shape())) { return Unimplemented("Multi-output reduce is not supported on GPU"); } - auto input = reduce->operand(0); - auto init_value = reduce->operand(1); - absl::Span dimensions_to_reduce(reduce->dimensions()); - HloComputation* reducer = reduce->to_apply(); - // HandleReduce specializes reduction from a multi-dimensional array to a 1D - // array. The specialized version requires an initializer thunk that - // initializes the output array to the initial value of the reduce. if (IsReductionToVector(*reduce)) { - TF_ASSIGN_OR_RETURN(std::unique_ptr initializer_thunk, - BuildInitializerThunk(reduce)); - std::vector> thunks; - thunks.push_back(std::move(initializer_thunk)); - std::unique_ptr kernel_thunk = - BuildKernelThunk(reduce, /*implements_whole_instruction=*/false); - - TF_CHECK_OK(EmitReductionToVector( - kernel_thunk.get(), reduce, input->shape(), - {[&](const IrArray::Index& index) { - return GetIrArray(*input, *reduce).EmitReadArrayElement(index, &b_); - }}, - {[&](const IrArray::Index& index) { - return GetIrArray(*init_value, *reduce) - .EmitReadArrayElement(index, &b_); - }}, - dimensions_to_reduce, {reducer}, {{}}, {})); - - thunks.push_back(std::move(kernel_thunk)); - - std::unique_ptr sequential_thunk = - absl::make_unique(std::move(thunks), reduce); - AddThunkToThunkSequence(std::move(sequential_thunk)); - return Status::OK(); + return EmitReductionToVector(reduce); } return IrEmitter::HandleReduce(reduce); @@ -1818,7 +763,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter( // Create the inner loop to iterate over the window. llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "inner"), &b_, index_type); - std::vector window_size; + DimensionVector window_size; for (const auto& dim : window.dimensions()) { window_size.push_back(dim.size()); CHECK_GT(dim.size(), 0); @@ -2171,7 +1116,18 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) { Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { std::vector> thunks; Shape keys_shape = sort->operand(0)->shape(); + int64 dimension_to_sort = sort->dimensions(0); + // In case there is a 'values' parameter that is a iota, we take note and use + // it later to ensure a stable sort. Otherwise, we don't guarantee a stable + // sort. 
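As a concrete picture of why an iota operand along the sort dimension makes the emitted sort stable, here is a sketch of one compare-and-swap step over (key, iota) pairs. The actual comparison is emitted by llvm_ir::EmitSortInPlace, so the function below only illustrates the tie-breaking idea; the names and the ascending direction are assumptions of this example.

```cuda
#include <cstdint>

// One compare-and-swap step of a bitonic-style sort.  The iota operand holds
// each element's original position along the sort dimension, so breaking ties
// on it yields a stable order; without such an operand no stability is
// guaranteed.
__host__ __device__ inline void CompareAndSwap(float* keys, int32_t* iota,
                                               int i, int j) {
  bool swap = keys[j] < keys[i] ||
              (keys[j] == keys[i] && iota[j] < iota[i]);  // tie-break on iota
  if (swap) {
    // Swap the keys and the iota values together so later stages keep seeing
    // consistent (key, original-index) pairs.
    float tk = keys[i]; keys[i] = keys[j]; keys[j] = tk;
    int32_t ti = iota[i]; iota[i] = iota[j]; iota[j] = ti;
  }
}
```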
+ int64 iota_values_parameter_index = -1; for (int64 i = 0; i < sort->operand_count(); ++i) { + if (i > 0 && sort->operand(i)->opcode() == HloOpcode::kIota && + ShapeUtil::ElementIsIntegral(sort->operand(i)->shape()) && + Cast(sort->operand(i))->iota_dimension() == + dimension_to_sort) { + iota_values_parameter_index = i; + } ShapeIndex shape_index = sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({}); // We assume that the layout of all involved operands and outputs is the @@ -2196,7 +1152,6 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { } } - int64 dimension_to_sort = sort->dimensions(0); uint64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound); CHECK_GE(1ULL << num_stages, dimension_to_sort_bound); @@ -2298,8 +1253,9 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { } } return llvm_ir::EmitSortInPlace( - dimension_to_sort, keys_array, values_arrays, IrName(sort), xor_masks, - &b_, launch_dimensions, + dimension_to_sort, keys_array, values_arrays, + iota_values_parameter_index, IrName(sort), xor_masks, &b_, + launch_dimensions, xor_masks.size() > 1 ? num_iterations_in_sort_dim : standard_num_iterations_in_sort_dim, kTileSize); @@ -2385,7 +1341,7 @@ Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) { return Status::OK(); } -Status IrEmitterUnnested::HandleAfterAll(HloInstruction* gen_token) { +Status IrEmitterUnnested::HandleAfterAll(HloInstruction* after_all) { return Status::OK(); } @@ -3146,31 +2102,6 @@ std::vector IrEmitterUnnested::ConstructIrArrayForInputs( return param_arrays; } -int IrEmitterUnnested::ConstructOutputReducedShapeAndCastOutputIrArrayToShape( - const HloInstruction& hlo, const std::vector& output_arrays, - absl::Span reduced_output_dims, - std::vector* output_reduced_shapes, - std::vector* output_in_reduced_shape_arrays) { - int64 num_outputs = 1; - if (hlo.IsMultiOutputFusion()) { - num_outputs = ShapeUtil::TupleElementCount(hlo.shape()); - output_in_reduced_shape_arrays->reserve(num_outputs); - output_reduced_shapes->reserve(num_outputs); - for (int64 i = 0; i < num_outputs; ++i) { - output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout( - ShapeUtil::GetSubshape(hlo.shape(), {i}).element_type(), - reduced_output_dims)); - output_in_reduced_shape_arrays->push_back( - output_arrays[i].CastToShape((*output_reduced_shapes)[i], &b_)); - } - } else { - output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout( - hlo.shape().element_type(), reduced_output_dims)); - output_in_reduced_shape_arrays->push_back( - output_arrays[0].CastToShape((*output_reduced_shapes)[0], &b_)); - } - return num_outputs; -} int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape( const HloInstruction& hlo, const std::vector& param_arrays, @@ -3230,82 +2161,854 @@ llvm::Value* GetBlockIdx(llvm::IRBuilder<>* builder, llvm::Type* index_ty, "block.id.x"); } -// Emits code to process up to (tile_size/num_rows) elements in a tile, given -// `emit_elem_function` is the function to emit code to process one element, `y` -// and `x` are the coordinates for the first element to process, and `index` is -// the index for the origin of the tile. Emits bounds check to ensure that each -// processed element is within the boundary defined by `tile_width` and -// `tile_height`. 
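The full-tile / partial-tile split described in the removed comment above (and implemented by the new EmitFullTile / EmitPartialTile pair that follows) can be summarized with a standalone CUDA kernel. The elementwise operation (a scale), the fixed 32x32 tile, and the 32x4 thread block are assumptions of this sketch; the emitter takes the real values from its KernelMappingScheme.

```cuda
#include <cuda_runtime.h>

constexpr int kTileSizeX = 32, kTileSizeY = 32;
constexpr int kNumThreadsX = 32, kNumThreadsY = 4;

// One thread block processes one kTileSizeY x kTileSizeX tile of a 2D array.
// A full interior tile takes the branch-free path; a partial tile on the
// right/bottom edge re-checks every coordinate against the tile bounds.
__global__ void ScaleTiled(float* data, int height, int width, float factor) {
  int tile_y0 = blockIdx.y * kTileSizeY;
  int tile_x0 = blockIdx.x * kTileSizeX;
  int tile_h = min(kTileSizeY, height - tile_y0);
  int tile_w = min(kTileSizeX, width - tile_x0);
  int x = threadIdx.x;  // 0 .. kNumThreadsX-1
  int y = threadIdx.y;  // 0 .. kNumThreadsY-1

  if (tile_h == kTileSizeY && tile_w == kTileSizeX) {
    // Full tile: every (y + i, x + j) is in bounds by construction.
    for (int i = 0; i < kTileSizeY; i += kNumThreadsY)
      for (int j = 0; j < kTileSizeX; j += kNumThreadsX)
        data[(tile_y0 + y + i) * width + tile_x0 + x + j] *= factor;
  } else {
    // Partial tile: guard both coordinates against the actual tile bounds.
    for (int j = 0; j < kTileSizeX; j += kNumThreadsX) {
      if (x + j >= tile_w) continue;
      for (int i = 0; i < kTileSizeY; i += kNumThreadsY) {
        if (y + i >= tile_h) continue;
        data[(tile_y0 + y + i) * width + tile_x0 + x + j] *= factor;
      }
    }
  }
}
```

It would be launched with a `dim3(kNumThreadsX, kNumThreadsY)` block and a grid of `ceil(width/32) x ceil(height/32)` blocks, so each thread handles `(kTileSizeX/kNumThreadsX) * (kTileSizeY/kNumThreadsY)` elements.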
-void EmitTiledElementalCodeWithBoundsCheck( - int64 tile_size, int64 num_rows, const IrArray::Index& index, - const string& loop_name, KernelSupportLibrary* ksl, - llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x, - llvm::Value* tile_width, llvm::Value* tile_height, - const std::function& - emit_elem_function) { - llvm::Type* index_ty = tile_width->getType(); - // Emits a constant value with index type. - auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { - return llvm::ConstantInt::get(index_ty, c); - }; - // Adds `addend` to the given `dim` of `index`. - auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) { - index[dim] = builder->CreateAdd(index[dim], addend); - return index; - }; - - auto emit_full_tile = [&] { - for (int64 i = 0; i < tile_size; i += num_rows) { - auto source_idx = offset_dim(index, index_typed_constant(i), /*dim=*/1); - auto y_loc = builder->CreateAdd(index_typed_constant(i), y); - emit_elem_function(source_idx, y_loc); +void EmitFullTile(const KernelMappingScheme* mapping_scheme, + const IrArray::Index& tile_origin_index, + llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x, + llvm::Type* index_ty, + const std::function& emit_elem_function) { + int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX(); + int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY(); + int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX(); + int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY(); + for (int64 i = 0; i < tile_size_y; i += num_threads_y) { + IrArray::Index source_idx_y = + tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, i), + KernelMappingScheme::DimY, builder); + llvm::Value* y_loc = + builder->CreateAdd(llvm::ConstantInt::get(index_ty, i), y); + for (int64 j = 0; j < tile_size_x; j += num_threads_x) { + IrArray::Index source_idx = + source_idx_y.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j), + KernelMappingScheme::DimX, builder); + llvm::Value* x_loc = + builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x); + emit_elem_function(source_idx, y_loc, x_loc); } - }; + } +} + +void EmitPartialTile( + const KernelMappingScheme* mapping_scheme, + const IrArray::Index& tile_origin_index, const string& loop_name, + KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, + llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width, + llvm::Type* index_ty, + const std::function& emit_elem_function) { + int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX(); + int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY(); + int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX(); + + for (int64 j = 0; j < tile_size_x; j += num_threads_x) { + IrArray::Index source_idx = + tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j), + KernelMappingScheme::DimX, builder); + llvm::Value* x_loc = + builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x); + + ksl->IfReturnVoid( + loop_name + "_x_in_tile", builder->CreateICmpULT(x_loc, tile_width), + [&] { + // tile_height_bound = + // ceil(tile_height / num_threads_y) * num_threads_y + llvm::Value* ceiling_of_ratio = builder->CreateUDiv( + builder->CreateAdd(tile_height, llvm::ConstantInt::get( + index_ty, num_threads_y - 1)), + llvm::ConstantInt::get(index_ty, num_threads_y)); + llvm::Value* tile_height_bound = builder->CreateMul( + ceiling_of_ratio, + llvm::ConstantInt::get(index_ty, num_threads_y)); + ksl->ForReturnVoid( + loop_name, 
/*start=*/llvm::ConstantInt::get(index_ty, 0), + /*end=*/tile_height_bound, + /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y), + [&](llvm::Value* y_indvar) { + llvm::Value* y_loc = builder->CreateAdd(y_indvar, y); + ksl->IfReturnVoid( + loop_name + "_y_in_tile", + builder->CreateICmpULT(y_loc, tile_height), [&] { + emit_elem_function( + source_idx.AddOffsetToDim( + y_indvar, KernelMappingScheme::DimY, builder), + y_loc, x_loc); + }); + }); + }); + } +} + +// Emits code to process up to +// (tile_size_x/num_threads_x * tile_size_y/num_threads_y) elements in a tile, +// given `emit_elem_function` is the function to emit code to process one +// element, `y` and `x` are the intra-tile coordinates for the first element +// to process, and `index` is the index for the origin of the tile. Information +// about tile_size_x/y and num_threads_x/y are stored in `mapping_scheme`. Emits +// bounds check to ensure that each processed element is within the boundary +// defined by `tile_width` and `tile_height`. +void EmitTiledElementalCodeWithBoundsCheck( + const KernelMappingScheme* mapping_scheme, + const IrArray::Index& tile_origin_index, const string& loop_name, + KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, + llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width, + const std::function& emit_elem_function) { + int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX(); + int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY(); + llvm::Type* index_ty = tile_width->getType(); - auto emit_last_row = [&] { - ksl->IfReturnVoid("x_in_tile", builder->CreateICmpULT(x, tile_width), [&] { - // tile_height_upper_bound = - // ceil(tile_height / num_rows) * num_rows - auto tile_height_upper_bound = builder->CreateMul( - builder->CreateUDiv( - builder->CreateAdd(tile_height, - index_typed_constant(num_rows - 1)), - index_typed_constant(num_rows)), - index_typed_constant(num_rows)); - ksl->ForReturnVoid( - loop_name, /*start=*/index_typed_constant(0), - /*end=*/tile_height_upper_bound, - /*step=*/index_typed_constant(num_rows), [&](llvm::Value* y_indvar) { - auto y_loc = builder->CreateAdd(y_indvar, y); - ksl->IfReturnVoid( - "y_in_tile", builder->CreateICmpULT(y_loc, tile_height), [&] { - emit_elem_function(offset_dim(index, y_indvar, /*dim=*/1), - y_loc); - }); - }); - }); - }; ksl->IfReturnVoid( - "full_tile", + loop_name + "_full_tile", builder->CreateAnd( - builder->CreateICmpEQ(index_typed_constant(tile_size), tile_width), - builder->CreateICmpEQ(index_typed_constant(tile_size), tile_height)), - emit_full_tile, emit_last_row); + builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_x), + tile_width), + builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_y), + tile_height)), + [&] { + EmitFullTile(mapping_scheme, tile_origin_index, builder, y, x, index_ty, + emit_elem_function); + }, + [&] { + EmitPartialTile(mapping_scheme, tile_origin_index, loop_name, ksl, + builder, y, x, tile_height, tile_width, index_ty, + emit_elem_function); + }); } } // namespace +// Emits code to process a tensor element in a tile for the given kCopy HLO that +// performs a 0-2-1 transpose. +// +// index: The index for the first output element in the normalized tensor. The +// normalized tensor is the resulting tensor after collapsing contiguous +// dimensions that play the same role in the transpose. +// y_loc: The y coordinate within a tile. +// x_loc: The x coordinate within a tile. 
+// kernel_info: Other information to support the kernel code generation. +void IrEmitterUnnested::EmitTileElementForCopy( + HloInstruction* hlo, const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, + llvm::Value* x_loc) { + llvm_ir::TiledParameterInfo* tiled_param_info = + kernel_info->GetTiledParameterInfo(); + // TODO(jlebar): Add AA metadata to this load. + llvm::Instruction* load_from_shmem_buffer = + Load(GEP(tiled_param_info->GetBufferForParameter(0), + {b_.getInt64(0), x_loc, y_loc}), + "output_element"); + llvm_ir::IrArray output_array = GetIrArray(*hlo, *hlo); + Shape output_reduced_shape = ShapeUtil::MakeShapeWithDescendingLayout( + hlo->shape().element_type(), + kernel_info->GetKernelMappingScheme()->GetDimensionsInElements()); + // When the output_reduced_shape is a 0-2-1 transpose of the input shape, + // the 0-2-1 transpose is achieved through EmitWriteArrayElement. + output_array.CastToShape(output_reduced_shape, &b_) + .EmitWriteArrayElement(index, load_from_shmem_buffer, &b_); +} + +// Emits code to process a tensor element in a tile for the given kLoop fusion +// HLO containing parameters that are 0-2-1 transpose of its outputs. +// +// index: The index for the first output element in the normalized tensor, that +// is the resulting tensor after collapsing contiguous dimensions that play +// the same role in the transpose. +// kernel_info: Other information to support the kernel code generation. +// y_loc: The y coordinate within a tile. +// x_loc: The x coordinate within a tile. +void IrEmitterUnnested::EmitTileElementForFusion( + HloInstruction* hlo, const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, + llvm::Value* x_loc) { + llvm_ir::TiledParameterInfo* tiled_param_info = + kernel_info->GetTiledParameterInfo(); + std::vector output_arrays = ConstructIrArrayForOutputs(*hlo); + GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_, + GetNestedComputer()); + FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(hlo), + &elem_emitter); + tiled_param_info->set_y(y_loc); + tiled_param_info->set_x(x_loc); + fused_emitter.SetTiledParameterInfo(tiled_param_info); + TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter)); + IrArray::Index untiled_index = + kernel_info->GetKernelMappingScheme()->GetUnnormalizedIndex( + index, output_arrays[0].GetShape()); + const llvm_ir::ElementGenerator& output_generator = + fused_emitter.GetRootGenerator(); + llvm::Value* output_value = output_generator(untiled_index).ValueOrDie(); + if (hlo->IsMultiOutputFusion()) { + DCHECK(output_value->getType()->isStructTy()); + DCHECK_EQ(output_value->getType()->getStructNumElements(), + output_arrays.size()); + for (int64 i = 0; i < output_arrays.size(); ++i) { + output_arrays[i].EmitWriteArrayElement( + untiled_index, ExtractValue(output_value, i), &b_); + } + } else { + output_arrays[0].EmitWriteArrayElement(untiled_index, output_value, &b_); + } +} + +// Information to support the code generation for a tiled reduction kernel. 
+using AddressVector = InlinedVector; +class ReductionCodegenInfo : public IrEmitterUnnested::KernelCodegenInfo { + public: + explicit ReductionCodegenInfo(llvm_ir::KernelMappingScheme* mapping_scheme, + bool is_row_reduction) + : KernelCodegenInfo(mapping_scheme), + current_output_linear_index_address_(nullptr), + current_output_inbound_address_(nullptr), + is_row_reduction_(is_row_reduction) {} + + void SetCurrentOutputLinearIndexAddress(llvm::AllocaInst* a) { + current_output_linear_index_address_ = a; + } + // Returns the address of the memory that stores the linear index of the + // current output. Since we are processing reduction to contiguous physical + // dimensions, this linear index is the linear index of the 1D output array. + llvm::AllocaInst* GetCurrentOutputLinearIndexAddress() const { + return current_output_linear_index_address_; + } + + void SetCurrentOutputInboundAddress(llvm::AllocaInst* a) { + current_output_inbound_address_ = a; + } + + llvm::AllocaInst* GetCurrentOutputInboundAddress() const { + return current_output_inbound_address_; + } + + AddressVector* GetMutablePartialResultAddresses() { + return &partial_result_addresses_; + } + const AddressVector& GetPartialResultAddresses() const { + return partial_result_addresses_; + } + + AddressVector* GetMutableReductionInputAddresses() { + return &reduction_input_addresses_; + } + const AddressVector& GetReductionInputAddresses() const { + return reduction_input_addresses_; + } + + InlinedVector* GetMutableReducers() { return &reducers_; } + const InlinedVector& GetReducers() const { + return reducers_; + } + int GetNumberOfReduces() const { return reducers_.size(); } + + InlinedVector* GetMutableReductionOutputShapeIndices() { + return &reduction_output_shape_indices_; + } + const InlinedVector& GetReductionOutputShapeIndices() const { + return reduction_output_shape_indices_; + } + + bool IsRowReduction() const { return is_row_reduction_; } + + // Return the dimension that is being reduced between DimX and DimY. + int GetReducedDimensionEnum() const { + return IsRowReduction() ? llvm_ir::KernelMappingScheme::DimX + : llvm_ir::KernelMappingScheme::DimY; + } + + // Return the dimension that is being ketp between DimX and DimY. + int GetKeptDimensionEnum() const { + return IsRowReduction() ? llvm_ir::KernelMappingScheme::DimY + : llvm_ir::KernelMappingScheme::DimX; + } + + private: + AddressVector partial_result_addresses_; + AddressVector reduction_input_addresses_; + InlinedVector reducers_; + InlinedVector reduction_output_shape_indices_; + llvm::AllocaInst* current_output_linear_index_address_; + llvm::AllocaInst* current_output_inbound_address_; + bool is_row_reduction_; +}; + +namespace { +// Returns a group of instructions that generate the output for the kernel +// containing the given HLO instruction. The result may be an unnested kReduce +// HLO, a nested kReduce HLO of a kInput fusion, or the operands of the tuple +// for a multiple output fusion. +absl::Span GetOutputInstructions( + HloInstruction* const* reduce_or_tuple_pointer) { + HloOpcode opcode = (*reduce_or_tuple_pointer)->opcode(); + CHECK(opcode == HloOpcode::kReduce || opcode == HloOpcode::kTuple); + return opcode == HloOpcode::kTuple + ? 
(*reduce_or_tuple_pointer)->operands() + : absl::Span(reduce_or_tuple_pointer, 1); +} + +const HloInstruction* GetFirstReduceInstruction( + absl::Span instructions) { + auto first_reduce_iter = + absl::c_find_if(instructions, [](const HloInstruction* inst) { + return inst->opcode() == HloOpcode::kReduce; + }); + CHECK_NE(first_reduce_iter, instructions.end()); + return *first_reduce_iter; +} + +}; // namespace + +void IrEmitterUnnested::EmitPrologueForOneReduction( + HloInstruction* unnested_hlo, HloInstruction* reduce_inst, int reduce_idx, + KernelCodegenInfo* kernel_info, GpuElementalIrEmitter* elemental_emitter, + ShapeIndex output_shape_index) { + ReductionCodegenInfo* reduction_info = + static_cast(kernel_info); + + InlinedVector* reducers = + reduction_info->GetMutableReducers(); + CHECK(IsReductionToVector(*reduce_inst)); + reducers->push_back(reduce_inst->to_apply()); + + InlinedVector* reduction_output_shape_indices = + reduction_info->GetMutableReductionOutputShapeIndices(); + reduction_output_shape_indices->push_back(std::move(output_shape_index)); + + AddressVector* reduction_input_addresses = + reduction_info->GetMutableReductionInputAddresses(); + llvm::Type* element_type = llvm_ir::PrimitiveTypeToIrType( + reduce_inst->shape().element_type(), ir_emitter_context_->llvm_module()); + llvm::AllocaInst* reduction_input_address = Alloca(element_type); + reduction_input_addresses->push_back(reduction_input_address); + + AddressVector* partial_result_addresses = + reduction_info->GetMutablePartialResultAddresses(); + llvm::AllocaInst* partial_result_address = + Alloca(element_type, /*ArraySize=*/nullptr, + "partial_reduction_result." + llvm::Twine(reduce_idx)); + partial_result_addresses->push_back(partial_result_address); + + // Initialize the partial result with the initial value of the reduction. + llvm::Value* init_ir_value; + if (unnested_hlo->opcode() == HloOpcode::kFusion) { + HloInstruction* init_value_operand = reduce_inst->mutable_operand(1); + FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(unnested_hlo), + elemental_emitter); + + TF_CHECK_OK(init_value_operand->Accept(&fused_emitter)); + init_ir_value = + fused_emitter + .GetGenerator(init_value_operand)(IrArray::Index(b_.getInt32Ty())) + .ValueOrDie(); + } else { + const HloInstruction* init_value = unnested_hlo->operand(1); + init_ir_value = + GetIrArray(*init_value, *unnested_hlo) + .EmitReadArrayElement(IrArray::Index(b_.getInt32Ty()), &b_); + } + + Store(init_ir_value, partial_result_address); +} + +void IrEmitterUnnested::EmitPrologueForReduction( + HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info) { + VLOG(10) << "Emit prologue for reduction " << unnested_hlo->ToString(); + // Find the unnested kReduce or the tuple that contains a list of kReduce. + HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion + ? 
unnested_hlo->fused_expression_root() + : unnested_hlo; + absl::Span output_instructions = + GetOutputInstructions(&reduce_or_tuple); + ReductionCodegenInfo* reduction_info = + static_cast(kernel_info); + GpuElementalIrEmitter elemental_emitter(hlo_module_config_, + ir_emitter_context_->llvm_module(), + &b_, GetNestedComputer()); + const HloInstruction* first_reduce = nullptr; + for (int i = 0, e = output_instructions.size(); i != e; ++i) { + if (output_instructions[i]->opcode() != HloOpcode::kReduce) { + continue; + } + HloInstruction* reduce_inst = output_instructions[i]; + if (first_reduce == nullptr) { + first_reduce = reduce_inst; + } else { + CHECK(first_reduce->dimensions() == reduce_inst->dimensions()); + } + ShapeIndex output_shape_index; + if (reduce_or_tuple->opcode() == HloOpcode::kTuple) { + output_shape_index = {i}; + } + + EmitPrologueForOneReduction(unnested_hlo, reduce_inst, i, kernel_info, + &elemental_emitter, + std::move(output_shape_index)); + } + + // Allocate stack storage to store the current output linear index and record + // the address of the storage. + reduction_info->SetCurrentOutputLinearIndexAddress( + Alloca(reduction_info->GetIndexType())); + + if (!reduction_info->IsRowReduction()) { + llvm::Type* bool_ty = b_.getInt1Ty(); + llvm::AllocaInst* output_inbound_addr = Alloca(bool_ty); + Store(llvm::ConstantInt::get(bool_ty, 0), output_inbound_addr); + reduction_info->SetCurrentOutputInboundAddress(output_inbound_addr); + } +} + +void IrEmitterUnnested::EmitFullWarpShuffleDownLoopForAllReduces( + const InlinedVector& reducers, + const AddressVector& partial_result_addresses) { + for (int distance = 16; distance >= 1; distance /= 2) { + for (int i = 0; i != reducers.size(); ++i) { + llvm::Type* element_type = + partial_result_addresses[i]->getType()->getElementType(); + int bit_width = llvm_ir::GetSizeInBits(element_type); + llvm::Value* result_from_other_lane = Alloca( + element_type, nullptr, "result_from_other_lane" + llvm::Twine(i)); + // Bitcast cannot be applied to aggregate types (even packed ones), so + // we bitcast addresses of load/store to intN* of the same bit-width. + llvm::Type* shuffled_value_type = + element_type->isStructTy() ? 
b_.getIntNTy(bit_width) : element_type; + auto convert_pointer_for_shuffle = [&](llvm::Value* ptr) { + return BitCast(ptr, shuffled_value_type->getPointerTo()); + }; + llvm::Value* partial_result = + Load(convert_pointer_for_shuffle(partial_result_addresses[i]), + "partial_reduction_result"); + Store(EmitFullWarpShuffleDown(partial_result, b_.getInt32(distance), &b_), + convert_pointer_for_shuffle(result_from_other_lane)); + TF_CHECK_OK(EmitCallToNestedComputation( + *reducers[i], {partial_result_addresses[i], result_from_other_lane}, + partial_result_addresses[i])); + } + } +} + +void IrEmitterUnnested::EmitEpilogueForReduction( + HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info) { + ReductionCodegenInfo* reduction_info = + static_cast(kernel_info); + int num_reduces = reduction_info->GetNumberOfReduces(); + const AddressVector& partial_result_addresses = + reduction_info->GetPartialResultAddresses(); + const InlinedVector& reducers = + reduction_info->GetReducers(); + const InlinedVector& reduction_output_shape_indices = + reduction_info->GetReductionOutputShapeIndices(); + + if (reduction_info->IsRowReduction()) { + EmitFullWarpShuffleDownLoopForAllReduces(reducers, + partial_result_addresses); + llvm::Value* lane_id = reduction_info->GetLaneId(); + llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse( + ICmpEQ(lane_id, llvm::ConstantInt::get(lane_id->getType(), 0)), + "lane_id_is_zero", &b_); + llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_); + } else { + llvm::Value* output_inbound_addr = + reduction_info->GetCurrentOutputInboundAddress(); + llvm::Value* output_inbound = Load(output_inbound_addr); + llvm_ir::LlvmIfData if_output_inbound_data = llvm_ir::EmitIfThenElse( + ICmpEQ(output_inbound, + llvm::ConstantInt::get(output_inbound->getType(), 1)), + "output_inbound", &b_); + llvm_ir::SetToFirstInsertPoint(if_output_inbound_data.true_block, &b_); + } + + // Emit an atomic operation that accumulates the partial reduction to the + // output element. For row reduction, this is only for lane 0 due to the + // if-statement emitted above. + for (int i = 0; i != num_reduces; ++i) { + IrArray::Index element_index( + /*linear=*/Load(reduction_info->GetCurrentOutputLinearIndexAddress(), + "output_linear_addr"), + ShapeUtil::GetSubshape(unnested_hlo->shape(), + reduction_output_shape_indices[i]), + &b_); + llvm::Value* output_address = + GetIrArray(*unnested_hlo, *unnested_hlo, + reduction_output_shape_indices[i]) + .EmitArrayElementAddress(element_index, &b_, + "output_element_address"); + // Do not emit atomic operations if each element in the reduction result is + // computed by one block, that is the dimension being reduced has only one + // block. 
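When a partial result does have to be combined atomically, that is when more than one block contributes to the same output element, reducers that do not map onto a single hardware atomic are typically lowered to a compare-and-swap loop. A sketch of that pattern is shown below, with `fmaxf` standing in for the nested reduction computation; it illustrates the idea only and is not the IR the emitter produces.

```cuda
#include <cuda_runtime.h>

// Read the current output, combine it with the partial result, and only
// commit if no other thread wrote in between; otherwise retry with the
// freshly observed value.
__device__ void AtomicReduceMax(float* output_address, float partial_result) {
  int* address_as_int = reinterpret_cast<int*>(output_address);
  int old_bits = *address_as_int;
  int assumed_bits;
  do {
    assumed_bits = old_bits;
    float combined = fmaxf(__int_as_float(assumed_bits), partial_result);
    old_bits = atomicCAS(address_as_int, assumed_bits,
                         __float_as_int(combined));
  } while (old_bits != assumed_bits);  // another thread intervened; retry
}
```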
+ const llvm_ir::KernelMappingScheme* mapping_scheme = + reduction_info->GetKernelMappingScheme(); + if (mapping_scheme->GetTileBlockSizeForDimension( + llvm_ir::KernelMappingScheme::DimZ) == 1 && + mapping_scheme->GetTileBlockSizeForDimension( + reduction_info->GetReducedDimensionEnum()) == 1) { + TF_CHECK_OK(EmitCallToNestedComputation( + *reducers[i], {output_address, partial_result_addresses[i]}, + output_address)); + } else { + TF_CHECK_OK(EmitAtomicOperationForNestedComputation( + *reducers[i], output_address, partial_result_addresses[i])); + } + } +} + +void IrEmitterUnnested::EmitTileElementForReduction( + HloInstruction* unnested_hlo, const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, + llvm::Value* x_loc) { + VLOG(10) << "Emit tile element for reduce " << unnested_hlo->ToString(); + HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion + ? unnested_hlo->fused_expression_root() + : unnested_hlo; + llvm_ir::TiledParameterInfo* tiled_param_info = + kernel_info->GetTiledParameterInfo(); + tiled_param_info->set_y(y_loc); + tiled_param_info->set_x(x_loc); + + // Record the linear address for the current reduction. + const ReductionCodegenInfo* reduction_info = + dynamic_cast(kernel_info); + Store(index[reduction_info->GetKeptDimensionEnum()], + reduction_info->GetCurrentOutputLinearIndexAddress()); + if (!reduction_info->IsRowReduction()) { + llvm::Type* bool_ty = b_.getInt1Ty(); + llvm::AllocaInst* output_inbound_addr = + reduction_info->GetCurrentOutputInboundAddress(); + Store(llvm::ConstantInt::get(bool_ty, 1), output_inbound_addr); + } + + InlinedVector input_gens; + std::vector> + extra_output_gens; + GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_, + GetNestedComputer()); + FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(unnested_hlo), + &elem_emitter); + absl::Span output_instructions = + GetOutputInstructions(&reduce_or_tuple); + // Construct the ElementGenerator for each reduction and extra output in the + // the group of output instructions. + if (unnested_hlo->opcode() == HloOpcode::kFusion) { + fused_emitter.SetTiledParameterInfo(tiled_param_info); + TF_CHECK_OK(unnested_hlo->fused_expression_root()->Accept(&fused_emitter)); + + for (int i = 0, e = output_instructions.size(); i != e; ++i) { + const HloInstruction* inst = output_instructions[i]; + ShapeIndex output_shape_index; + if (reduce_or_tuple->opcode() == HloOpcode::kTuple) { + output_shape_index = {i}; + } + if (inst->opcode() == HloOpcode::kReduce) { + input_gens.push_back(fused_emitter.GetGenerator(inst->operand(0))); + } else { + extra_output_gens.emplace_back(fused_emitter.GetGenerator(inst), + std::move(output_shape_index)); + } + } + } else { + input_gens.push_back([&](const IrArray::Index& index) { + return GetIrArray(*unnested_hlo->operand(0), *unnested_hlo) + .EmitReadArrayElement(index, &b_); + }); + } + + IrArray::Index input_index = + reduction_info->GetKernelMappingScheme()->GetUnnormalizedIndex( + index, + GetFirstReduceInstruction(output_instructions)->operand(0)->shape()); + const AddressVector& partial_reduction_result_addresses = + reduction_info->GetPartialResultAddresses(); + const AddressVector& reduction_input_addresses = + reduction_info->GetReductionInputAddresses(); + const InlinedVector& reducers = + reduction_info->GetReducers(); + + // Emit code to generate the input and perform the reduction computation for + // each reduction instruction. 
+ for (int i = 0; i != reducers.size(); ++i) { + llvm::Value* const input_ir_value = input_gens[i](input_index).ValueOrDie(); + Store(input_ir_value, reduction_input_addresses[i]); + TF_CHECK_OK(EmitCallToNestedComputation( + *reducers[i], + {partial_reduction_result_addresses[i], reduction_input_addresses[i]}, + partial_reduction_result_addresses[i])); + } + + // Emit code to generate the output for the non-reduction instructions in the + // fusion, if any. + TF_CHECK_OK( + EmitExtraOutputsForReduce(unnested_hlo, input_index, extra_output_gens)); +} + +// Emits a kernel for the hlo instruction using the given tiling scheme. +void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile, + const KernelCodegenInfo* kernel_info, + KernelSupportLibrary& ksl, + llvm::Type* index_ty) { + KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme(); + absl::Span dims_in_tile = mapping_scheme->GetDimensionsInTiles(); + absl::Span dims_in_block = + mapping_scheme->GetDimensionsInBlocks(); + absl::Span block_sizes = mapping_scheme->GetBlockSizes(); + auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_ty, c); + }; + + // Emit all the tiles for a given dimension in a tile block. + auto emit_tiles_for_block_dim = + [&](const string& loop_name, const IrArray::Index& starting_tile, + int dim_id, + const std::function + emit_next_block_dim) { + if (block_sizes[dim_id] == 1) { + emit_next_block_dim(starting_tile); + } else { + llvm::Value* starting_tile_index_for_dim = starting_tile[dim_id]; + llvm::Value* block_size_for_dim = + index_typed_constant(block_sizes[dim_id]); + llvm::Value* block_id_for_dim = + b_.CreateUDiv(starting_tile_index_for_dim, block_size_for_dim); + llvm::Value* last_block_for_dim = + index_typed_constant(dims_in_block[dim_id] - 1); + llvm::Value* last_block_size_for_dim = index_typed_constant( + dims_in_tile[dim_id] - + (dims_in_block[dim_id] - 1) * block_sizes[dim_id]); + llvm::Value* num_tiles_in_block = + Select(ICmpEQ(last_block_for_dim, block_id_for_dim), + last_block_size_for_dim, block_size_for_dim); + + ksl.ForReturnVoid( + loop_name, + /*start=*/index_typed_constant(0), + /*end=*/num_tiles_in_block, + /*step=*/1, [&](llvm::Value* block_dim_induction_var) { + IrArray::Index tile_index = starting_tile.AddOffsetToDim( + block_dim_induction_var, dim_id, &b_); + emit_next_block_dim(tile_index); + }); + } + }; + + absl::Span reduced_dims = + mapping_scheme->GetDimensionsInElements(); + const bool block_contains_multi_tiles = + mapping_scheme->GetNumberOfTilesInOneBlock() > 1; + + // Emit the tile with a given tile_index, by calculating the tight bounds for + // each dimension of the tile and then calling emit_one_tile. + auto emit_one_tile_for_tile_index = [&](const IrArray::Index& tile_index) { + std::vector output_tile_bounds(3); + for (int i = KernelMappingScheme::DimY; i < KernelMappingScheme::DimTot; + ++i) { + int64 tile_size_for_dim = mapping_scheme->GetTileSizeForDimension(i); + // Only last row or column may not have full size. 
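A quick numeric check of the "last tile may be partial" bound computed below; the sizes are made up for illustration.

```cuda
#include <cstdio>

// For a dimension of dim_size elements tiled by tile_size, every tile has
// bound tile_size except possibly the last one.
long long TileBound(long long dim_size, long long tile_size,
                    long long tile_index) {
  long long num_tiles = (dim_size + tile_size - 1) / tile_size;
  return tile_index == num_tiles - 1
             ? dim_size - (num_tiles - 1) * tile_size
             : tile_size;
}

int main() {
  // 100 elements tiled by 32 => 4 tiles with bounds 32, 32, 32, 4.
  for (long long t = 0; t < 4; ++t)
    std::printf("tile %lld bound %lld\n", t, TileBound(100, 32, t));
  return 0;
}
```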
+ llvm::Value* is_last_row = + ICmpEQ(tile_index[i], index_typed_constant(dims_in_tile[i] - 1)); + int64 partial_row_size = + reduced_dims[i] - (dims_in_tile[i] - 1) * tile_size_for_dim; + output_tile_bounds[i] = + Select(is_last_row, index_typed_constant(partial_row_size), + index_typed_constant(tile_size_for_dim), "tile_bound"); + } + + IrArray::Index tile_origin = + mapping_scheme->GetElementIndexForTileOrigin(tile_index); + emit_one_tile(tile_origin, output_tile_bounds, block_contains_multi_tiles); + }; + + const IrArray::Index starting_block = + mapping_scheme->EmitBlockIndex(index_ty); + const IrArray::Index starting_tile_for_dim_z = + mapping_scheme->GetTileIndexForBlockOrigin(starting_block); + + // Emit the three dimensional block of tiles. + emit_tiles_for_block_dim( + "block_dim_z", starting_tile_for_dim_z, KernelMappingScheme::DimZ, + [&](const IrArray::Index& starting_tile_for_dim_y) { + emit_tiles_for_block_dim( + "block_dim_y", starting_tile_for_dim_y, KernelMappingScheme::DimY, + [&](const IrArray::Index& starting_tile_for_dim_x) { + emit_tiles_for_block_dim("block_dim_x", starting_tile_for_dim_x, + KernelMappingScheme::DimX, + emit_one_tile_for_tile_index); + }); + }); +} + +// Emits a kernel for the hlo instruction using the given kernel mapping scheme. +// +// unnested_hlo: The unnested hlo instruction for which the kernel is generated. +// Currently, these hlo instructions are supported: kLoop fusion, kCopy. +// tiled_param_ids: The IDs for the parameters that are 0-2-1 transpose of +// other tensors with the same dimensions and need to be tiled and tranposed. +// mapping_scheme: The tiling scheme to use. +// kernel_generator: Contains function objects for code generation, such as +// element generator, block prologue and epilogue generators. +// kernel_info: Represent other information to support the code generation +// of the tiled kernel for the hlo. +LaunchDimensions IrEmitterUnnested::EmitKernel( + HloInstruction* unnested_hlo, absl::Span tiled_param_ids, + const KernelCodeGenerator& kernel_generator, + KernelCodegenInfo* kernel_info) { + KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme(); + + std::vector param_arrays = ConstructIrArrayForInputs(*unnested_hlo); + int64 num_params = param_arrays.size(); + // Allocate shared memory buffers to store the tiled inputs. + std::vector param_shmem_buffers(num_params, nullptr); + for (int64 id : tiled_param_ids) { + const HloInstruction* param = unnested_hlo->operand(id); + param_shmem_buffers[id] = + mapping_scheme->GetSharedMemoryBufferForElementType( + llvm_ir::PrimitiveTypeToIrType(param->shape().element_type(), + module_), + IrName(unnested_hlo, StrCat("tile", id))); + VLOG(3) << "Added shmem buffer for parameter " << id << ": " + << llvm_ir::DumpToString(*param_shmem_buffers[id]); + } + + LaunchDimensions launch_dimensions = LaunchDimensions( + mapping_scheme->GetNumberOfBlocks(), mapping_scheme->GetThreadsPerTile()); + llvm::Type* index_ty = GetIndexTypeForKernel( + unnested_hlo, launch_dimensions.launch_bound(), &b_); + auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_ty, c); + }; + + // For each tiled parameter, cast its input IrArray to the corresponding + // reduced shape and keep the reduced shape live during IR emission. 
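The launch dimensions constructed a few lines above reduce to simple arithmetic over the mapping scheme. A back-of-the-envelope sketch, assuming one tile per block, 32x32 tiles on the two minor dimensions, and 32x4 threads per tile (all of which the emitter really reads from the KernelMappingScheme):

```cuda
#include <cstdio>

int main() {
  long long dims_in_elements[3] = {6, 100, 200};  // normalized {Z, Y, X}
  long long tile_sizes[3] = {1, 32, 32};
  long long num_threads_y = 4, num_threads_x = 32;

  long long num_blocks = 1;
  for (int i = 0; i < 3; ++i) {
    long long dims_in_tiles =
        (dims_in_elements[i] + tile_sizes[i] - 1) / tile_sizes[i];
    num_blocks *= dims_in_tiles;  // here: 6 * 4 * 7 = 168 (one tile per block)
  }
  long long threads_per_block = num_threads_x * num_threads_y;  // 128
  std::printf("launch <<<%lld, %lld>>>\n", num_blocks, threads_per_block);
  return 0;
}
```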
+ std::vector param_in_reduced_shape_arrays; + std::vector param_reduced_shapes; + absl::Span reduced_dims = + mapping_scheme->GetDimensionsInElements(); + int num_shapes = ConstructInputReducedShapeAndCastInputIrArrayToShape( + *unnested_hlo, param_arrays, param_shmem_buffers, reduced_dims, + ¶m_reduced_shapes, ¶m_in_reduced_shape_arrays); + DCHECK_EQ(num_shapes, num_params); + + // Calculate the starting element coordinate within a tile for the current + // thread, (y, x) from thread_id. + llvm::Value* x; + llvm::Value* y; + std::tie(y, x) = mapping_scheme->EmitThreadYXCoordinate(index_ty); + + kernel_info->SetLaneId( + mapping_scheme->GetNumberOfThreadsForDimensionX() == kWarpSize ? x + : nullptr); + kernel_info->SetIndexType(index_ty); + + KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll); + // Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck. + auto emit_tiled_elemental_code_with_bounds_check = + [&](const IrArray::Index& index, const string& loop_name, + llvm::Value* tile_height, llvm::Value* tile_width, + const std::function& emit_elem_function) { + EmitTiledElementalCodeWithBoundsCheck(mapping_scheme, index, loop_name, + &ksl, &b_, y, x, tile_height, + tile_width, emit_elem_function); + }; + + auto emit_one_tile = [&](const IrArray::Index& output_tile_origin, + absl::Span output_tile_bounds, + bool block_contains_multi_tiles) { + // Calculate the input tile origin from the output tile origin. + const IrArray::Index input_tile_origin( + Permute({0, 2, 1}, output_tile_origin.multidim())); + + const IrArray::Index input_index = + input_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_) + .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_); + + // If shared memory transpose is needed, wait for all threads to reach this + // point, lest we copy a value from tile to output before the other thread + // copies it from input to tile. This is `__syncthreads` in CUDA. + if (!tiled_param_ids.empty()) { + // Copy input parameter values to shared memory buffers: + // tile[y, x] = input[index] + // Note that tile_width and tile_height are flipped here because we are + // reading a transposed tile. + emit_tiled_elemental_code_with_bounds_check( + input_index, "input", output_tile_bounds[2], output_tile_bounds[1], + [&](const IrArray::Index& index, llvm::Value* y_loc, + llvm::Value* x_loc) { + for (int64 id : tiled_param_ids) { + IrArray& input_in_logical_shape = + param_in_reduced_shape_arrays[id]; + llvm::Value* shmem_buffer = param_shmem_buffers[id]; + // TODO(jlebar): Add AA metadata to this store. Tile buffers are + // global variables, so LLVM can't infer much about it. + Store(input_in_logical_shape.EmitReadArrayElement( + index, &b_, "input_element"), + GEP(shmem_buffer, {index_typed_constant(0), y_loc, x_loc})); + } + }); + + // Wait for all threads to reach this point using `__syncthreads` in CUDA. + llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_); + } + + llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x); + kernel_info->SetTiledParamInfo(&tiled_param_info); + + const IrArray::Index output_index = + output_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_) + .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_); + + // Write to output[index] by emitting code like normal, except that values + // for the tiled parameters are read from the shmem buffers. 
+ emit_tiled_elemental_code_with_bounds_check( + output_index, "output", output_tile_bounds[1], output_tile_bounds[2], + [&](const IrArray::Index& index, llvm::Value* y_loc, + llvm::Value* x_loc) { + kernel_generator.GetTileElementGenerator()(unnested_hlo, index, + kernel_info, y_loc, x_loc); + }); + + // If a tile block contains multiple tiles and shared memory buffers are + // used, we need to wait for all threads to finish using the shared memory + // buffer for the current tile before we move on to process the next tile + // and overwrite the shared memory buffers. + if (block_contains_multi_tiles && !tiled_param_ids.empty()) { + llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_); + } + }; + + const BlockPrologueGenerator& block_prologue_generator = + kernel_generator.GetBlockPrologueGenerator(); + if (block_prologue_generator) { + block_prologue_generator(unnested_hlo, kernel_info); + } + + EmitBlock(std::move(emit_one_tile), kernel_info, ksl, index_ty); + + const BlockEpilogueGenerator& block_epilogue_generator = + kernel_generator.GetBlockEpilogueGenerator(); + if (block_epilogue_generator) { + block_epilogue_generator(unnested_hlo, kernel_info); + } + + // For multioutput fusion, emit a tuple with pointers to all the individual + // outputs. + if (unnested_hlo->IsMultiOutputFusion()) { + std::vector output_arrays = + ConstructIrArrayForOutputs(*unnested_hlo); + llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo), output_arrays, + &b_, module_); + } + + return launch_dimensions; +} + // Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose // algorithm to improve the memory access patterns for the input parameters -// which have a shape that is a 0-2-1 transpose of the output tensors. +// with a shape that is a 0-2-1 transpose of the output tensor shape. // // For the purpose of tiling, the output tensors have a logical shape of three -// components 0-2-1 while the relevant input parameters have a logical shape of -// three components 0-1-2 in the order major to minor. The x- and y- dimensions -// of the tensors are tiled in square tiles of edge length `kTileSize`. Each -// thread block of `kTileSize` x `kNumRows` threads transposes one tile: each -// thread copies kTileSize/kNumRows elements from the input to a shared memory -// tile, then the otherwise "regular hlo kernel" reads from the shared memory -// instead of the original input. +// components 0-2-1 while the relevant input parameters have a logical shape +// of three components 0-1-2 in the order major to minor. The x- and y- +// dimensions of the tensors are tiled in square tiles with an edge length +// `kTileSize`. Each thread block of `kTileSize` x `kNumRows` threads +// transposes one tile: each thread copies kTileSize/kNumRows elements from +// the input to a shared memory tile, then the otherwise "regular HLO kernel" +// reads from the shared memory instead of the original input. // // This is similar to the following CUDA algorithm in TensorFlow: // https://goo.gl/MStRV6. @@ -3313,219 +3016,37 @@ void EmitTiledElementalCodeWithBoundsCheck( // `kTileSize` should usually be same as warp size. We currently choose 32 for // `kTileSize` and 4 for `kNumRows`. The CUDA algorithm uses 8 for `kNumRows`. // -// TODO(b/33320379): Here each block transposes 1 tile. It may be more efficient -// to launch fewer blocks so each transposes many tiles. +// TODO(b/33320379): Here each block transposes 1 tile. 
It may be more +// efficient to launch fewer blocks so each transposes many tiles. LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( HloInstruction* hlo, absl::Span reduced_output_dims, absl::Span tiled_param_ids) { - // Parameters for the tiling algorithm. - constexpr int64 kTileSize = 32; - constexpr int64 kNumRows = 4; - constexpr int64 kThreadsPerTile = kTileSize * kNumRows; - - // Construct IrArrays for the inputs and outputs. - std::vector output_arrays = ConstructIrArrayForOutputs(*hlo); - int64 num_outputs = output_arrays.size(); - std::vector param_arrays = ConstructIrArrayForInputs(*hlo); - int64 num_params = param_arrays.size(); - - // Allocate shared memory buffers to store the tiled inputs. - std::vector param_shmem_buffers(num_params, nullptr); - for (int64 id : tiled_param_ids) { - const HloInstruction* param = hlo->operand(id); - // Add 1 to the minor dimension to reduce shared memory bank conflicts. - llvm::Type* tile_type = llvm::ArrayType::get( - llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType( - param->shape().element_type(), module_), - kTileSize + 1), - kTileSize); - auto* tile_base_ptr = llvm_ir::AllocateSharedMemoryTile( - b_.GetInsertBlock()->getParent()->getParent(), tile_type, - IrName(hlo, StrCat("tile", id))); - param_shmem_buffers[id] = tile_base_ptr; - VLOG(3) << "Added shmem buffer for parameter " << id << ": " - << llvm_ir::DumpToString(*tile_base_ptr); - } - - // The 0-2-1 shape of the tiling scheme is the reduced shape of the HLO result - // for the purpose of tiling. Calculate the logical output dimensions in the - // tile from the reduced output dimensions. - std::vector output_dims_in_tiles = std::vector( - reduced_output_dims.begin(), reduced_output_dims.end()); - CHECK_EQ(output_dims_in_tiles.size(), 3); - for (int i = 1; i < 3; ++i) { - output_dims_in_tiles[i] = - CeilOfRatio(output_dims_in_tiles[i], kTileSize); - } - const int64 num_tiles = - absl::c_accumulate(output_dims_in_tiles, 1, std::multiplies()); - LaunchDimensions launch_dimensions(num_tiles, kThreadsPerTile); - - llvm::Type* index_ty = - GetIndexTypeForKernel(hlo, launch_dimensions.launch_bound(), &b_); - auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { - return llvm::ConstantInt::get(index_ty, c); - }; - - // Cast each output IrArray to its corresponding reduced shape and keep the - // reduced shape live during IR emission. - std::vector output_in_reduced_shape_arrays; - std::vector output_reduced_shapes; - CHECK_EQ(ConstructOutputReducedShapeAndCastOutputIrArrayToShape( - *hlo, output_arrays, reduced_output_dims, &output_reduced_shapes, - &output_in_reduced_shape_arrays), - num_outputs); - - // For each tiled parameter, cast its input IrArray to the corresponding - // reduced shape and keep the reduced shape live during IR emission. - std::vector param_in_reduced_shape_arrays; - std::vector param_reduced_shapes; - CHECK_EQ(ConstructInputReducedShapeAndCastInputIrArrayToShape( - *hlo, param_arrays, param_shmem_buffers, reduced_output_dims, - ¶m_reduced_shapes, ¶m_in_reduced_shape_arrays), - num_params); - - // Calculate the starting element coordinate within a tile for the current - // thread, (y, x) from thread_id. - llvm::Value* x; - llvm::Value* y; - std::tie(y, x) = CalculateYXCoordinateWithinTile( - &b_, index_typed_constant(kTileSize), kThreadsPerTile); - - // Calculate the index for the current output tile from block_id. 
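For readers who want a concrete picture of the tiled 0-2-1 transpose the comment above describes, the following is a minimal standalone CUDA sketch of the same pattern. It is an illustration only, not the code this change emits: the kernel, its arguments, the launch configuration, and the row-major indexing are assumptions, while `kTileSize = 32`, `kNumRows = 4`, the `+ 1` shared-memory padding, and the barrier follow the conventions discussed in this file. The major-most dimension of the 0-2-1 shape is omitted for brevity; each 32x4 thread block transposes one 32x32 tile, so every thread copies kTileSize/kNumRows = 8 elements per pass.

```
// Hypothetical standalone sketch; assumes a launch of
//   dim3 block(kTileSize, kNumRows);
//   dim3 grid(CeilDiv(width, kTileSize), CeilDiv(height, kTileSize));
// where `in` has shape [height, width] and `out` has shape [width, height].
constexpr int kTileSize = 32;
constexpr int kNumRows = 4;

__global__ void TransposeMinorDims(const float* in, float* out, int height,
                                   int width) {
  // The extra column reduces shared-memory bank conflicts.
  __shared__ float tile[kTileSize][kTileSize + 1];

  int x = blockIdx.x * kTileSize + threadIdx.x;
  int y = blockIdx.y * kTileSize + threadIdx.y;

  // tile[y, x] = input[y, x]; the loop strides so the block covers the tile.
  for (int j = 0; j < kTileSize; j += kNumRows) {
    if (x < width && y + j < height) {
      tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * width + x];
    }
  }
  __syncthreads();  // The nvvm_barrier0 intrinsic emitted by the IR emitter.

  // Read the tile with the x/y roles swapped and write the transposed output.
  x = blockIdx.y * kTileSize + threadIdx.x;
  y = blockIdx.x * kTileSize + threadIdx.y;
  for (int j = 0; j < kTileSize; j += kNumRows) {
    if (x < height && y + j < width) {
      out[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j];
    }
  }
}
```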
- const IrArray::Index output_tile_index( - GetBlockIdx(&b_, index_ty, num_tiles), - ShapeUtil::MakeShapeWithDescendingLayout(PRED /*arbitrary*/, - output_dims_in_tiles), - &b_); - - // Output tile origin is the index for the first element of the current output - // tile. - const IrArray::Index output_tile_origin = [&] { - IrArray::Index index = output_tile_index; - for (int i = 1; i < 3; ++i) { - index[i] = Mul(output_tile_index[i], index_typed_constant(kTileSize), - "tile_origin." + std::to_string(i)); - } - return index; - }(); - - // Calculate the input tile origin from the output tile origin. - const IrArray::Index input_tile_origin( - Permute({0, 2, 1}, output_tile_origin.multidim())); - - // Calculate the current output tile bounds in each of the logical dimensions. - std::vector output_tile_bounds(3); - for (int i = 1; i < 3; ++i) { - // Only last row or column may not have full size. - output_tile_bounds[i] = - Select(ICmpEQ(output_tile_index[i], - index_typed_constant(output_dims_in_tiles[i] - 1)), - index_typed_constant(reduced_output_dims[i] - - (output_dims_in_tiles[i] - 1) * kTileSize), - index_typed_constant(kTileSize), "kTileSize"); - } - - KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll); - - // Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck. - auto emit_tiled_elemental_code_with_bounds_check = - [&](const IrArray::Index& index, const string& loop_name, - llvm::Value* tile_width, llvm::Value* tile_height, - const std::function& - emit_elem_function) { - EmitTiledElementalCodeWithBoundsCheck( - kTileSize, kNumRows, index, loop_name, &ksl, &b_, y, x, tile_width, - tile_height, emit_elem_function); - }; - - // Adds `addend` to the given `dim` of `index`. - auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) { - index[dim] = Add(index[dim], addend); - return index; - }; - const IrArray::Index input_index = - offset_dim(offset_dim(input_tile_origin, x, /*dim=*/2), y, /*dim=*/1); - - // Copy input parameter values to shared memory buffers: - // tile[y, x] = input[index] - emit_tiled_elemental_code_with_bounds_check( - input_index, "input", output_tile_bounds[1], output_tile_bounds[2], - [&](const IrArray::Index& index, llvm::Value* y_loc) { - for (int64 id : tiled_param_ids) { - IrArray& input_in_logical_shape = param_in_reduced_shape_arrays[id]; - llvm::Value* shmem_buffer = param_shmem_buffers[id]; - // TODO(jlebar): Add AA metadata to this store. Tile buffers are - // global variables, so LLVM can't infer much about it. - Store(input_in_logical_shape.EmitReadArrayElement(index, &b_, - "input_element"), - GEP(shmem_buffer, {index_typed_constant(0), y_loc, x})); - } - }); - - // Wait for all threads to reach this point, lest we copy a value from tile to - // output before the other thread copies it from input to tile. - // This is `__syncthreads` in CUDA. - llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_); - - llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x); - - const IrArray::Index output_index = - offset_dim(offset_dim(output_tile_origin, x, /*dim=*/2), y, /*dim=*/1); - - // Write to output[index] by emitting code like normal, except that values for - // the tiled parameters are read from the shmem buffers. 
+ constexpr int kNumRows = 4; + KernelMappingScheme mapping_scheme( + reduced_output_dims, /*tile_size_y=*/kWarpSize, + /*tile_size_x=*/kWarpSize, /*req_block_sizes=*/{1, 1, 1}, + /*num_threads_y=*/kNumRows, + /*num_threads_x=*/kWarpSize, &b_); + TileElementGenerator element_generator; if (hlo->opcode() == HloOpcode::kCopy) { - emit_tiled_elemental_code_with_bounds_check( - output_index, "output", output_tile_bounds[2], output_tile_bounds[1], - [&](const IrArray::Index& index, llvm::Value* y_loc) { - // TODO(jlebar): Add AA metadata to this load. - llvm::Instruction* load_from_shmem_buffer = - Load(GEP(param_shmem_buffers[0], {b_.getInt64(0), x, y_loc}), - "output_element"); - output_in_reduced_shape_arrays[0].EmitWriteArrayElement( - index, load_from_shmem_buffer, &b_); - }); + element_generator = [&](HloInstruction* hlo, + const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, + llvm::Value* y_loc, llvm::Value* x_loc) { + EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc); + }; } else { - CHECK_EQ(hlo->opcode(), HloOpcode::kFusion); - emit_tiled_elemental_code_with_bounds_check( - output_index, "output", output_tile_bounds[2], output_tile_bounds[1], - [&](const IrArray::Index& index, llvm::Value* y_loc) { - GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_, - GetNestedComputer()); - FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(hlo), - &elem_emitter); - tiled_param_info.set_y(y_loc); - fused_emitter.SetTiledParameterInfo(&tiled_param_info); - TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter)); - IrArray::Index untiled_index = llvm_ir::GetUnreducedOutputIndex( - index, output_reduced_shapes[0], output_arrays[0].GetShape(), - &b_); - const llvm_ir::ElementGenerator& output_generator = - fused_emitter.GetRootGenerator(); - llvm::Value* output_value = - output_generator(untiled_index).ValueOrDie(); - if (hlo->IsMultiOutputFusion()) { - CHECK(output_value->getType()->isStructTy()); - CHECK_EQ(output_value->getType()->getStructNumElements(), - output_in_reduced_shape_arrays.size()); - for (int64 i = 0; i < output_in_reduced_shape_arrays.size(); ++i) { - output_in_reduced_shape_arrays[i].EmitWriteArrayElement( - index, ExtractValue(output_value, i), &b_); - } - } else { - output_in_reduced_shape_arrays[0].EmitWriteArrayElement( - index, output_value, &b_); - } - }); + DCHECK_EQ(hlo->opcode(), HloOpcode::kFusion); + element_generator = [&](HloInstruction* hlo, + const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, + llvm::Value* y_loc, llvm::Value* x_loc) { + EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc); + }; } - - // For multioutput fusion, emit a tuple with all the individual outputs. - if (hlo->IsMultiOutputFusion()) { - llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), output_arrays, &b_, module_); - } - - return launch_dimensions; + KernelCodegenInfo kernel_info(&mapping_scheme); + KernelCodeGenerator kernel_generator(std::move(element_generator)); + return EmitKernel(hlo, tiled_param_ids, kernel_generator, &kernel_info); } namespace { @@ -3562,8 +3083,8 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) { ? ShapeUtil::GetSubshape(hlo->shape(), {0}) : hlo->shape(); - // If the output_shape is reduced to 021 shape, find all the parameters of the - // hlo that are in the corresponding 012 shape. + // If the output_shape is reduced to 021 shape, find all the parameters of + // the HLO that are in the corresponding 012 shape. 
std::vector params_012; optional> reduced_dims_021; for (int64 operand_idx = 0; operand_idx < hlo->operand_count(); @@ -3600,9 +3121,9 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) { } // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the - // elements are of size 4 bytes), and CUDA has an architectural limit of 48kb - // shared memory per SM. (This is increased to 96kb in Volta, but we don't - // use this, in part because it eats into our L1 cache space.) + // elements are of size 4 bytes), and CUDA has an architectural limit of + // 48kb shared memory per SM. (This is increased to 96kb in Volta, but we + // don't use this, in part because it eats into our L1 cache space.) // // For correctness we need to ensure that we don't make more than 48kb worth // of shmem tiles per block. And for performance, we'd probably like to use @@ -3610,9 +3131,9 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) { // gpu core. // // We say without benchmarks that we want at least 3 threads/block, - // corresponding to 3 shmem tiles if the elements are 32 bits wide. We choose - // which params get the shmem transpose treatment arbitrarily; it's not clear - // if there's a Right Choice. + // corresponding to 3 shmem tiles if the elements are 32 bits wide. We + // choose which params get the shmem transpose treatment arbitrarily; it's + // not clear if there's a Right Choice. // // This is only sound if tiled transposes are the only place where we use // shared memory in fusions. If in the future other fusible ops use shared @@ -3645,6 +3166,246 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) { return true; } +namespace { +// Checks that the outputs of a fusion with reduction are consistent. +Status AreFusedReductionOutputsConsistent( + absl::Span output_instructions, + const HloInstruction* first_reduce) { + for (const HloInstruction* inst : output_instructions) { + if (inst->opcode() == HloOpcode::kReduce) { + // Shapes, layouts and dimensions must be the same for all reduces + // inside of this fusion. + TF_RET_CHECK(ShapeUtil::Equal(first_reduce->shape(), inst->shape())); + TF_RET_CHECK(ShapeUtil::Equal(first_reduce->operand(0)->shape(), + inst->operand(0)->shape())); + TF_RET_CHECK(ShapeUtil::Equal(first_reduce->operand(1)->shape(), + inst->operand(1)->shape())); + TF_RET_CHECK(first_reduce->dimensions() == inst->dimensions()); + } else { + // For extra outputs we can relax shape equality to allow different + // types (with the same number of elements). Layouts still have to + // match. + TF_RET_CHECK(ShapeUtil::CompatibleIgnoringElementType( + first_reduce->operand(0)->shape(), inst->shape())); + TF_RET_CHECK(LayoutUtil::Equal(first_reduce->operand(0)->shape().layout(), + inst->shape().layout())); + } + } + return Status::OK(); +} + +// Finds the dimensions to keep for the reduction, sorts and returns the +// dimensions from minor to major. +DimensionVector GetDimensionsToKeepMinorToMajor( + const Shape& input_shape, absl::Span dims_to_reduce) { + DimensionVector input_dims(ShapeUtil::Rank(input_shape), 0); + absl::c_iota(input_dims, 0); + DimensionVector input_dims_to_keep; + for (int input_dim : input_dims) { + auto it = absl::c_find_if(dims_to_reduce, [&](int64 dim_to_reduce) { + return dim_to_reduce == input_dim; + }); + if (it == dims_to_reduce.end()) { + input_dims_to_keep.push_back(input_dim); + } + } + + // Sort the dimensions to keep from minor to major. 
+ absl::c_sort(input_dims_to_keep, [&input_shape](int64 dim_a, int64 dim_b) { + return PositionInContainer(LayoutUtil::MinorToMajor(input_shape), dim_a) < + PositionInContainer(LayoutUtil::MinorToMajor(input_shape), dim_b); + }); + + VLOG(10) << "dims to keep minor to major" + << absl::StrJoin(input_dims_to_keep, ","); + return input_dims_to_keep; +} + +// Given the input shape and dimensions to reduce for the reduction to vector, +// returns : +// num_kept: the number of elements in the contiguous dimensions to keep. +// num_reduced_major: the number of elements in the dimensions to reduce that +// are more major than the dimensions to keep. +// num_reduced_minor: the number of elements in the dimensions to reduce that +// are more minor than the dimensions to kept. +std::tuple GetReductionToVectorDimensions( + const Shape& input_shape, absl::Span dims_to_reduce) { + DimensionVector input_dims_to_keep_minor_to_major = + GetDimensionsToKeepMinorToMajor(input_shape, dims_to_reduce); + CHECK(LayoutUtil::AreDimensionsConsecutive( + input_shape.layout(), input_dims_to_keep_minor_to_major)); + int num_reduced_major = 1, num_kept = 1, num_reduced_minor = 1; + if (input_dims_to_keep_minor_to_major.empty()) { + return std::make_tuple(num_reduced_major, num_kept, num_reduced_minor); + } + DimensionVector input_dims(ShapeUtil::Rank(input_shape), 0); + absl::c_iota(input_dims, 0); + absl::Span minor_to_major = + LayoutUtil::MinorToMajor(input_shape); + for (int input_dim : input_dims) { + int64 curr_dim_size = input_shape.dimensions(input_dim); + if (PositionInContainer(minor_to_major, input_dim) > + PositionInContainer(minor_to_major, + input_dims_to_keep_minor_to_major.back())) { + num_reduced_major *= curr_dim_size; + } else if (PositionInContainer(minor_to_major, input_dim) < + PositionInContainer(minor_to_major, + input_dims_to_keep_minor_to_major.front())) { + num_reduced_minor *= curr_dim_size; + } else { + num_kept *= curr_dim_size; + } + } + + return std::make_tuple(num_reduced_major, num_kept, num_reduced_minor); +} + +std::tuple ComputeMappingSchemeAndReductionKind( + const HloInstruction* first_reduce, llvm::IRBuilder<>* b) { + int64 depth = 1; + int64 height = 1; + int64 width = 1; + bool is_row_reduction = true; + int64 tile_size_x = 1; + int64 tile_size_y = 1; + int64 block_size_y = 1; + int64 block_size_z = 1; + int64 num_threads_x = 1; + int64 num_threads_y = 1; + const Shape& input_shape = first_reduce->operand(0)->shape(); + int64 num_input_elems = ShapeUtil::ElementsIn(input_shape); + int64 num_output_elems = ShapeUtil::ElementsIn(first_reduce->shape()); + int64 num_reduced_major, num_kept, num_reduced_minor; + std::tie(num_reduced_major, num_kept, num_reduced_minor) = + GetReductionToVectorDimensions(input_shape, first_reduce->dimensions()); + CHECK_EQ(num_output_elems, num_kept); + + if (num_kept == 1) { + // Scalar reduction is a special row reduction with depth = height = 1. + width = num_input_elems; + tile_size_x = kWarpSize * 16; + num_threads_x = kWarpSize; + } else if (num_reduced_minor == 1) { + // Column reduction reduces inputs with dimension [height, width], where + // width is the minor dimension, to dimension [width]. + height = num_reduced_major; + width = num_kept; + is_row_reduction = false; + tile_size_x = std::min(kWarpSize, num_kept); + // The old Column reduction algorithm uses kTileHeight = 128. We choose + // tile_size_y * block_size_y = 128 to match the value of kTileHeight. 
Using + // a non-trivial block_size_y here is a way to avoid unrolling all the 128 + // iterations. + tile_size_y = 32; + block_size_y = 4; + num_threads_x = tile_size_x; + } else { + // Row reduction reduces inputs with dimension [depth, height, width], + // where width is the most minor dimension, to dimension [height] . + depth = num_reduced_major; + height = num_kept; + width = num_reduced_minor; + num_threads_x = kWarpSize; + if (width % (kWarpSize * 64) == 0) { + tile_size_x = kWarpSize * 64; + } else { + tile_size_x = kWarpSize * 8; + block_size_z = 8; + while (depth % block_size_z != 0) { + block_size_z -= 1; + } + } + } + DCHECK_EQ(depth * height * width, num_input_elems); + VLOG(10) << "is_row_reduction " << is_row_reduction << depth << " " << height + << " " << width; + + DimensionVector dims_in_elem{depth, height, width}; + DimensionVector req_block_sizes{block_size_z, block_size_y, 1}; + llvm_ir::KernelMappingScheme mapping_scheme(dims_in_elem, tile_size_y, + tile_size_x, req_block_sizes, + num_threads_y, num_threads_x, b); + return std::make_tuple(mapping_scheme, is_row_reduction); +} + +} // namespace + +Status IrEmitterUnnested::EmitReductionToVector(HloInstruction* unnested_hlo) { + VLOG(10) << "Emitting reduction to vector " << unnested_hlo->ToString(); + + HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion + ? unnested_hlo->fused_expression_root() + : unnested_hlo; + absl::Span output_instructions = + GetOutputInstructions(&reduce_or_tuple); + const HloInstruction* first_reduce = + GetFirstReduceInstruction(output_instructions); + + if (output_instructions.size() > 1) { + TF_RETURN_IF_ERROR( + AreFusedReductionOutputsConsistent(output_instructions, first_reduce)); + } + + // Build an initializer thunk to initialize each reduction output. + std::vector> thunks; + for (int i = 0, e = output_instructions.size(); i != e; ++i) { + if (output_instructions[i]->opcode() != HloOpcode::kReduce) { + continue; + } + TF_ASSIGN_OR_RETURN( + std::unique_ptr initializer_thunk, + BuildInitializerThunk(unnested_hlo, + (output_instructions[i] == reduce_or_tuple) + ? ShapeIndex() + : ShapeIndex({i}))); + thunks.push_back(std::move(initializer_thunk)); + } + + // Build a kernel thunk to compute all the outputs. + std::unique_ptr kernel_thunk = + BuildKernelThunk(unnested_hlo, /*implements_whole_instruction=*/false); + + const Shape& input_shape = first_reduce->operand(0)->shape(); + // The layout of a reduction input is either set by LayoutAssignment for + // unnested kReduce or by InstructionFusion for fused kReduce. 
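As a worked illustration of the (num_reduced_major, num_kept, num_reduced_minor) decomposition used above, here is a simplified standalone sketch. It is not XLA code: shapes are assumed dense and row major, the kept dimensions are assumed contiguous, and every name in it is hypothetical.

```
// Sketch of the reduction-dimension bookkeeping, for illustration only.
#include <cstdint>
#include <iostream>
#include <vector>

struct ReductionDims {
  int64_t num_reduced_major = 1;  // reduced dims more major than the kept dims
  int64_t num_kept = 1;           // contiguous dims that survive the reduction
  int64_t num_reduced_minor = 1;  // reduced dims more minor than the kept dims
};

// `dims` are listed major to minor (row major); reduce[i] says whether dim i
// is reduced. The kept dims are assumed contiguous, as in the code above.
ReductionDims Classify(const std::vector<int64_t>& dims,
                       const std::vector<bool>& reduce) {
  ReductionDims r;
  int first_kept = -1, last_kept = -1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    if (!reduce[i]) {
      if (first_kept < 0) first_kept = i;
      last_kept = i;
    }
  }
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    if (first_kept >= 0 && i >= first_kept && i <= last_kept) {
      r.num_kept *= dims[i];
    } else if (first_kept >= 0 && i < first_kept) {
      r.num_reduced_major *= dims[i];
    } else {
      r.num_reduced_minor *= dims[i];
    }
  }
  return r;
}

int main() {
  // f32[100,64], reduce dim 1 (minor): row reduction, height=100, width=64.
  ReductionDims row = Classify({100, 64}, {false, true});
  // f32[100,64], reduce dim 0 (major): column reduction, height=100, width=64.
  ReductionDims col = Classify({100, 64}, {true, false});
  // f32[100,64], reduce both dims: scalar reduction over 6400 elements.
  ReductionDims scalar = Classify({100, 64}, {true, true});
  std::cout << row.num_kept << " " << col.num_kept << " " << scalar.num_kept
            << "\n";  // 100 64 1
}
```

The mapping-scheme code above then treats num_kept == 1 as a scalar reduction, num_reduced_minor == 1 as a column reduction, and everything else as a row reduction.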
+ CHECK(input_shape.has_layout()) << "LayoutAssignment or InstructionFusion " + "doesn't set the input layout of " + << first_reduce->ToString(); + + bool is_row_reduction; + llvm_ir::KernelMappingScheme mapping_scheme; + std::tie(mapping_scheme, is_row_reduction) = + ComputeMappingSchemeAndReductionKind(first_reduce, &b_); + ReductionCodegenInfo reduction_info(&mapping_scheme, is_row_reduction); + KernelCodeGenerator kernel_generator( + /*tile_element_generator=*/ + [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, + llvm::Value* x_loc) { + EmitTileElementForReduction(hlo, index, kernel_info, y_loc, x_loc); + }, + /*block_prologue_generator=*/ + [&](HloInstruction* hlo, KernelCodegenInfo* kernel_info) { + EmitPrologueForReduction(hlo, kernel_info); + }, + /*block_epilogue_generator*/ + [&](HloInstruction* hlo, KernelCodegenInfo* kernel_info) { + EmitEpilogueForReduction(hlo, kernel_info); + }); + + LaunchDimensions launch_dimensions = + EmitKernel(unnested_hlo, {}, kernel_generator, &reduction_info); + UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(), + ir_emitter_context_->llvm_module()); + + thunks.push_back(std::move(kernel_thunk)); + std::unique_ptr sequential_thunk = + absl::make_unique(std::move(thunks), unnested_hlo); + AddThunkToThunkSequence(std::move(sequential_thunk)); + + return Status::OK(); +} + Status IrEmitterUnnested::EmitConstantGlobals() { for (const BufferAllocation& allocation : ir_emitter_context_->buffer_assignment().Allocations()) { @@ -3666,10 +3427,10 @@ Status IrEmitterUnnested::EmitConstantGlobals() { } // These globals will be looked up by name by GpuExecutable so we need to - // give them an external linkage. Not all of their uses are visible in the - // LLVM IR (e.g. TupleThunk) so we can't give then a linkage that merely - // preserves their names (like available_externally), we also need to ensure - // that they stick around even if they're "unused". + // give them an external linkage. Not all of their uses are visible in + // the LLVM IR (e.g. TupleThunk) so we can't give then a linkage that + // merely preserves their names (like available_externally), we also need + // to ensure that they stick around even if they're "unused". // // We may have to be more more clever here in the future if we notice that // we're keeping around too many globals because of their linkage. diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 334c0b3c20b..85a0e5328c4 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -16,9 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_ +#include "absl/container/inlined_vector.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h" #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" +#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h" #include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h" namespace xla { @@ -47,6 +49,99 @@ namespace gpu { // class IrEmitterUnnested : public IrEmitter { public: + // Parameter block_contains_multi_tiles indicates whether a tile block + // consists of multiple tiles or not. 
If the tile block contains only one + // tile, there is no need to use atomic operation to accumulate a local result + // to a global result to implement reduction. + using TileGenerator = + std::function output_tile_bounds, + bool block_contains_multi_tiles)>; + // KernelCodegenInfo records the common information to support the code + // generation for a kernel to process tensor elements by blocks. A block of + // tensor elements may contain one or multiple tiles. The code generators that + // generate code for tile elements or block prologue/epilogue refer to this + // class in their prototypes. If the implementations of such code generators + // require other information that are specific to the HLO instructions, the + // implementations need to define and use derived classes of this class. + class KernelCodegenInfo { + public: + explicit KernelCodegenInfo(llvm_ir::KernelMappingScheme* mapping_scheme) + : mapping_scheme_(mapping_scheme), + tiled_param_info_(nullptr), + lane_id_(nullptr), + index_ty_(nullptr) {} + virtual ~KernelCodegenInfo() {} + + void SetLaneId(llvm::Value* v) { lane_id_ = v; } + void SetIndexType(llvm::Type* t) { index_ty_ = t; } + void SetTiledParamInfo(llvm_ir::TiledParameterInfo* tiled_param_info) { + CHECK_EQ(tiled_param_info_, nullptr); + tiled_param_info_ = tiled_param_info; + } + + llvm::Value* GetLaneId() const { return lane_id_; } + llvm_ir::KernelMappingScheme* GetKernelMappingScheme() const { + return mapping_scheme_; + } + llvm_ir::TiledParameterInfo* GetTiledParameterInfo() const { + return tiled_param_info_; + } + llvm::Type* GetIndexType() const { return index_ty_; } + + private: + llvm_ir::KernelMappingScheme* mapping_scheme_; + llvm_ir::TiledParameterInfo* tiled_param_info_; + llvm::Value* lane_id_; + llvm::Type* index_ty_; + }; + + // A function object to prepare for the code generation for a tile block. + using BlockPrologueGenerator = + std::function; + // A function object to finalize the code generation for a tile block. + using BlockEpilogueGenerator = + std::function; + // A function object to generate code to process one element in a tile. + // + // hlo: the instruction for which the code is generated for. + // index: the index for the first output element of the current thread. + // y_loc: The y coordinate within a tile. + // x_loc: The x coordinate within a tile. + // kernel_info: Other information to support the kernel code generation. + using TileElementGenerator = std::function; + + // KernelCodeGenerator records the code generator objects that generate code + // for tile elements or tile block prologue/epilogue. 
+ class KernelCodeGenerator { + public: + explicit KernelCodeGenerator( + TileElementGenerator tile_element_generator, + BlockPrologueGenerator block_prologue_generator = {}, + BlockEpilogueGenerator block_epilogue_generator = {}) + : tile_element_generator_(std::move(tile_element_generator)), + block_prologue_generator_(std::move(block_prologue_generator)), + block_epilogue_generator_(std::move(block_epilogue_generator)) {} + + const TileElementGenerator& GetTileElementGenerator() const { + return tile_element_generator_; + } + const BlockPrologueGenerator& GetBlockPrologueGenerator() const { + return block_prologue_generator_; + } + const BlockEpilogueGenerator& GetBlockEpilogueGenerator() const { + return block_epilogue_generator_; + } + + private: + TileElementGenerator tile_element_generator_; + BlockPrologueGenerator block_prologue_generator_; + BlockEpilogueGenerator block_epilogue_generator_; + }; + IrEmitterUnnested(const HloModuleConfig& hlo_module_config, const HloComputation* hlo_computation, IrEmitterContext* ir_emitter_context); @@ -82,7 +177,7 @@ class IrEmitterUnnested : public IrEmitter { Status HandleSort(HloInstruction* sort) override; Status HandleTupleSelect(HloInstruction* tuple_select) override; Status HandleCrossReplicaSum(HloInstruction* crs) override; - Status HandleAfterAll(HloInstruction* gen_token) override; + Status HandleAfterAll(HloInstruction* after_all) override; Status EmitTargetElementLoop( const HloInstruction& hlo, @@ -111,82 +206,14 @@ class IrEmitterUnnested : public IrEmitter { // Helper for writing extra outputs from inside a reduce kernel. Status EmitExtraOutputsForReduce( - const HloInstruction* reduce, const llvm_ir::IrArray::Index& index, + const HloInstruction* unnested_hlo, const llvm_ir::IrArray::Index& index, absl::Span> extra_output_gens); - // EmitColumnReduction and EmitRowReduction emit code for column and row - // reduction of a matrix and/or 3D tensor. Row and column reduction have - // different memory access pattern, so for performance their implementations - // are significantly different. + // Generates code for reduction to contiguous dimensions. // - // Emits code that reduces a matrix of shape [height x width] to a vector of - // [width]. Other parameters have the same meaning as those of - // `EmitReductionToVector`. Note that input shape might not be - // [height x width], but can be bitcast to [height x width] with "height" - // being the major dimension. - Status EmitColumnReduction( - KernelThunk* kernel_thunk, int64 height, int64 width, - HloInstruction* reduce, const Shape& input_shape, - absl::Span input_gens, - absl::Span init_value_gens, - absl::Span reducers, - absl::Span reduce_output_shapes, - absl::Span> - extra_output_gens); - - // Emits code that reduces a 3D tensor of shape [depth x height x width] to a - // vector of shape [height]. Other parameters have the same meaning as those - // of `EmitReductionToVector`. Note that input shape might not be - // [depth x height x width], but can be bitcast to [depth x height x width] - // with "depth" being the most major dimension. - Status EmitRowReduction( - KernelThunk* kernel_thunk, int64 depth, int64 height, int64 width, - HloInstruction* reduce, const Shape& input_shape, - absl::Span input_gens, - absl::Span init_value_gens, - absl::Span reducers, - absl::Span reduce_output_shapes, - absl::Span> - extra_output_gens); - - // Emits code that reduces a tensor of arbitrary rank to a scalar. 
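To summarize how the KernelCodegenInfo and KernelCodeGenerator interfaces declared above are meant to fit together, here is a simplified, self-contained sketch of the callback structure. All types are stand-ins, and the real EmitKernel emits LLVM IR instead of looping on the host; this only models the call order (prologue once per block, the element generator once per tile element, epilogue once per block).

```
// Simplified, hypothetical model of the generator-callback pattern.
#include <functional>
#include <iostream>
#include <utility>

struct Hlo {};          // stands in for HloInstruction
struct CodegenInfo {};  // stands in for KernelCodegenInfo

using TileElementGenerator =
    std::function<void(Hlo*, int index, CodegenInfo*, int y, int x)>;
using BlockPrologueGenerator = std::function<void(Hlo*, CodegenInfo*)>;
using BlockEpilogueGenerator = std::function<void(Hlo*, CodegenInfo*)>;

class KernelCodeGenerator {
 public:
  explicit KernelCodeGenerator(TileElementGenerator tile,
                               BlockPrologueGenerator prologue = {},
                               BlockEpilogueGenerator epilogue = {})
      : tile_(std::move(tile)),
        prologue_(std::move(prologue)),
        epilogue_(std::move(epilogue)) {}

  // EmitKernel-style driver: optional prologue, per-element generator for
  // every element of every tile, optional epilogue.
  void EmitKernel(Hlo* hlo, CodegenInfo* info, int tiles, int tile_dim) const {
    if (prologue_) prologue_(hlo, info);
    for (int t = 0; t < tiles; ++t) {
      for (int y = 0; y < tile_dim; ++y) {
        for (int x = 0; x < tile_dim; ++x) {
          tile_(hlo, t * tile_dim * tile_dim + y * tile_dim + x, info, y, x);
        }
      }
    }
    if (epilogue_) epilogue_(hlo, info);
  }

 private:
  TileElementGenerator tile_;
  BlockPrologueGenerator prologue_;
  BlockEpilogueGenerator epilogue_;
};

int main() {
  Hlo hlo;
  CodegenInfo info;
  int emitted = 0;
  KernelCodeGenerator gen(
      [&](Hlo*, int, CodegenInfo*, int, int) { ++emitted; },
      [&](Hlo*, CodegenInfo*) { std::cout << "prologue\n"; },
      [&](Hlo*, CodegenInfo*) { std::cout << "epilogue\n"; });
  gen.EmitKernel(&hlo, &info, /*tiles=*/2, /*tile_dim=*/4);
  std::cout << emitted << " elements emitted\n";  // 32 elements emitted
}
```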
- Status EmitReductionToScalar( - KernelThunk* kernel_thunk, HloInstruction* reduce, - const Shape& input_shape, - absl::Span input_gens, - absl::Span init_value_gens, - absl::Span reducers, - absl::Span reduce_output_shapes, - absl::Span> - extra_output_gens); - - // Figures out whether `reduce` is a row or column reduction, and which - // dimensions to reduce, and calls either `EmitRowReduction` or - // `EmitColumnReduction` as appropriate. `input_shape` is the shape of the - // input array, which is the operand of the Reduce instruction if unfused or - // of the Fusion instruction if fused. `input_gen` and `init_value_gen` - // generate elements of the input and the initial value. Other parameters mean - // the same as for `HandleReduce`. - // - // Multiple reduces can be emitted in the same loop, assuming they have the - // same input and output shapes, and the same reduce dimensions. - // - // extra_output_gens can contain extra generators for intermediate outputs. - // These must have the same shape as the reduce input as they are computed - // when the reduce inputs are being read. - // - // Prerequisite: `IsReductionToVector(*reduce)` - Status EmitReductionToVector( - KernelThunk* kernel_thunk, HloInstruction* reduce, - const Shape& input_shape, - absl::Span input_gens, - absl::Span init_value_gens, - absl::Span dimensions_to_reduce, - absl::Span reducers, - absl::Span reduce_output_shapes, - absl::Span> - extra_output_gens); + // Prerequisite: `IsReductionToVector(*unnested_hlo)` + Status EmitReductionToVector(HloInstruction* unnested_hlo); // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in // the process. `scatter` may be fused, scatter indices are taken from @@ -205,22 +232,55 @@ class IrEmitterUnnested : public IrEmitter { LaunchDimensions EmitHlo021Tile(HloInstruction* hlo, absl::Span reduced_output_dims, absl::Span tiled_param_ids); + // Emits a kernel for an unnested HLO instruction. + LaunchDimensions EmitKernel(HloInstruction* unnested_hlo, + absl::Span param_ids, + const KernelCodeGenerator& kernel_generator, + KernelCodegenInfo* kernel_info); + void EmitBlock(const TileGenerator& emit_one_tile, + const KernelCodegenInfo* kernel_info, + KernelSupportLibrary& ksl, llvm::Type* index_ty); + // Emits code to process a tensor element in a tile for the given kCopy HLO + // that performs a 0-2-1 transpose. + void EmitTileElementForCopy(HloInstruction* hlo, + const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, + llvm::Value* y_loc, llvm::Value* x_loc); + // Emits code to process a tensor element in a tile for the given kLoop fusion + // HLO containing parameters that are 0-2-1 transpose of its outputs. + void EmitTileElementForFusion(HloInstruction* hlo, + const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, + llvm::Value* y_loc, llvm::Value* x_loc); + // Emits code to process a tensor element in a tile for the given input hlo + // that is either a unnested kReduce or a kInput fusion. + void EmitTileElementForReduction(HloInstruction* unnested_hlo, + const llvm_ir::IrArray::Index& index, + const KernelCodegenInfo* kernel_info, + llvm::Value* y_loc, llvm::Value* x_loc); + // Prepares for the code generation for a tile block of a reduction kernel. 
+ void EmitPrologueForReduction(HloInstruction* unnested_hlo, + KernelCodegenInfo* kernel_info); + void EmitPrologueForOneReduction(HloInstruction* unnested_hlo, + HloInstruction* reduce_inst, int reduce_idx, + KernelCodegenInfo* kernel_info, + GpuElementalIrEmitter* elemental_emitter, + ShapeIndex output_shape_index); + // Wraps up the code generation for a tile block of a reduction kernel. + void EmitEpilogueForReduction(HloInstruction* unnested_hlo, + KernelCodegenInfo* kernel_info); + // For each reducer, emits the shuffle-down loop to accumulate the partial + // result to the global result. + void EmitFullWarpShuffleDownLoopForAllReduces( + const absl::InlinedVector& reducers, + const absl::InlinedVector& + partial_result_addresses); // Generates the IrArray for each input of an hlo and returns a vector that // constains such IrArrays. std::vector ConstructIrArrayForInputs( const HloInstruction& hlo); - // For each output of the `hlo` instruction, constructs the reduced shape for - // the output with the given `reduced_output_dims` and cast the original - // output IrArray element in `output_arrays` to the reduced shape. Returns - // the number of outputs. - int ConstructOutputReducedShapeAndCastOutputIrArrayToShape( - const HloInstruction& hlo, - const std::vector& output_arrays, - absl::Span reduced_output_dims, - std::vector* output_reduced_shapes, - std::vector* output_in_reduced_shape_arrays); // For each input of the `hlo` instruction, checks its value in // `param_buffers` to find out whether the input has a reduced shape. If the // input has a reduced shape, constructs the reduced shape for the input and diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index 8751e3a9c2a..24f07e68973 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -177,13 +177,6 @@ std::unique_ptr GetTargetMachine( } TargetOptions target_options = InitTargetOptionsFromCodeGenFlags(); - llvm_ir::SetTargetOptions( - /*fast_math_enabled=*/hlo_module_config.debug_options() - .xla_gpu_enable_fast_math(), - &target_options); - - // Enable FMA synthesis. - target_options.AllowFPOpFusion = FPOpFusion::Fast; // Set the verbose assembly options. target_options.MCOptions.AsmVerbose = false; @@ -453,18 +446,21 @@ void GPUBackendInit(const HloModuleConfig& hlo_module_config) { // * 3-6 gives similar results as 2; // * >6 start hurting the performance of at least dot product kernels. // - // TODO(jingyue): The current threshold only considers the numbr of IR + // TODO(jingyue): The current threshold only considers the number of IR // instructions which do not accurately reflect the true cost. We need a // better cost model. FeedLLVMWithFlags({"-bonus-inst-threshold=2"}); - // TODO(b/22073864): Increase limit when scan memory dependency. - // This helps to reduce more redundant load instructions. + // Increase limit when scanning memory dependencies. This helps to reduce + // more redundant load instructions. // // The specific value is currently large enough for s3d in shoc benchmark, // which contains a lot of load instructions and many arithmetic instructions // between those loads. FeedLLVMWithFlags({"-memdep-block-scan-limit=500"}); + // Use div.approx -- it matters for some float-division heavy benchmarks. 
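The shuffle-down loop referred to by EmitFullWarpShuffleDownLoopForAllReduces above is the standard warp-level reduction idiom. A minimal CUDA sketch of that idiom is shown here for orientation only; it is not the IR the emitter produces, and the accumulation into the global result (for example via atomicAdd when several blocks contribute to one output) is an assumption about the surrounding code.

```
// Standard warp-level sum via shuffle-down, for illustration.
// Each lane starts with a partial result; after the loop, lane 0 holds the
// sum of all 32 lanes and can fold it into the global result.
__inline__ __device__ float WarpReduceSum(float partial) {
  constexpr int kWarpSize = 32;
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    partial += __shfl_down_sync(0xffffffff, partial, offset);
  }
  return partial;  // Valid on lane 0.
}
```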
+ FeedLLVMWithFlags({"-nvptx-prec-divf32=0"}); + llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config); // Initialize the NVPTX target; it's the only target we link with, so call its diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc index d9b06828e2b..01fddcede64 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc @@ -41,50 +41,7 @@ GpuMultiOutputFusion::GpuMultiOutputFusion() : MultiOutputFusion(INT64_MAX) {} bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1, HloInstruction* instr2) { - auto get_element_instr = - [&](const HloInstruction* instr) -> const HloInstruction* { - const HloInstruction* element_instr = instr; - if (instr->opcode() == HloOpcode::kFusion) { - auto fused_expression_root = instr->fused_expression_root(); - if (instr->IsMultiOutputFusion()) { - // If possible, we want to pick a reduce operand of the fusion root, - // because it has the most constraints. - for (const auto* inst : fused_expression_root->operands()) { - if (IsReductionToVector(*inst)) { - return inst; - } - } - return fused_expression_root->operands()[0]; - } else { - element_instr = fused_expression_root; - } - } - return element_instr; - }; - - auto get_element_shape = [&](const HloInstruction* element_instr) { - // Special handling of kReduce instructions -- the fusion - // applies to the first operand. - if (IsReductionToVector(*element_instr)) { - return element_instr->operand(0)->shape(); - } - return element_instr->shape(); - }; - - // The shapes in all tuple operands should agree, unless it is a reduce. - // In that case, the operand of the reduce needs to have the same shape - // as the other tuple operands, but also we need to compare the output - // shapes of the reduces. - auto* element_instr_1 = get_element_instr(instr1); - auto* element_instr_2 = get_element_instr(instr2); - if (element_instr_1->opcode() == HloOpcode::kReduce && - element_instr_2->opcode() == HloOpcode::kReduce && - !ShapeUtil::Equal(element_instr_1->shape(), element_instr_2->shape())) { - return false; - } - // The elementwise output shapes must be the same (including layout). - return ShapeUtil::EqualIgnoringFpPrecision( - get_element_shape(element_instr_1), get_element_shape(element_instr_2)); + return ShapesCompatibleForMultiOutputFusion(*instr1, *instr2); } bool GpuMultiOutputFusion::IsFusible(HloInstruction* instr) { @@ -205,7 +162,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() { VLOG(3) << producer->name() << " is not a loop fusion."; continue; } - if (!ShapesCompatibleForFusion(producer, consumer)) { + if (!ShapesCompatibleForMultiOutputFusion(*producer, *consumer)) { VLOG(3) << producer->name() << " has an incompatible shape."; continue; } diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc index dc221f22a74..d16c87ba5c6 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc @@ -580,7 +580,7 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) { // ... // where each of the (pi * pj)'s is represented as a fusion node so that // multi-output fusion will pay attention to it. 
- auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder b(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {10, 100}); diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index de04ed85c30..e934cbda176 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -67,6 +67,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" +#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" @@ -173,13 +174,16 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true); + pipeline.AddPass(); + // BatchNormExpander can create zero-sized ops, so zero-sized HLO // elimination has to come after that pass. pipeline.AddPass(); - pass.AddPass( - /*is_layout_sensitive=*/false, + AlgebraicSimplifierOptions options( [](const Shape&, const Shape&) { return false; }); + options.set_enable_permutation_sort_replacement(true); + pass.AddPass(options); pass.AddPass(); pass.AddPass(); pass.AddPass(); @@ -248,11 +252,13 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. - pipeline.AddPass>( - /*is_layout_sensitive=*/true, + AlgebraicSimplifierOptions options( /*valid_bitcast_callback=*/[](const Shape&, const Shape&) { return true; }); + options.set_is_layout_sensitive(true); + options.set_enable_permutation_sort_replacement(true); + pipeline.AddPass>(options); // Choose the fastest algorithm for each conv. // @@ -810,7 +816,7 @@ std::vector NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx, // binaries are not available. We don't want to spam logs with // identical warnings in this case. - // TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N + // TODO(jlebar): we should implement a LOG_FIRST_N and LOG_EVERY_N // for more general usage. 
static std::atomic warning_done(false); log_warning = !warning_done.exchange(true); diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc index f2ef11e1e6a..31a5d7a8c04 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc @@ -30,7 +30,7 @@ namespace gpu { class StreamAssignmentTest : public HloTestBase { protected: - std::unique_ptr CreateNewUnverifiedModule() { + std::unique_ptr CreateNewVerifiedModule() { HloModuleConfig config; auto debug_options = GetDebugOptionsForTest(); debug_options.set_xla_gpu_disable_multi_streaming(false); @@ -55,7 +55,7 @@ TEST_F(StreamAssignmentTest, SequentialMatMul) { HloInstruction* dot2 = builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build(dot2)); std::unique_ptr assignment = AssignStreams(*module); @@ -76,7 +76,7 @@ TEST_F(StreamAssignmentTest, ConcurrentMatMul) { HloInstruction* add = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build(add)); std::unique_ptr assignment = AssignStreams(*module); @@ -120,7 +120,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) { HloInstruction* d40 = builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build(d40)); std::unique_ptr assignment = AssignStreams(*module); diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h index d2f30ae7bc4..d917320e363 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h @@ -26,7 +26,7 @@ namespace gpu { // Tests that verify IR or PTX emitted by the GPU backend is as expected. class GpuCodegenTest : public LlvmIrGenTestBase { protected: - // Like HloTestBase::CreateNewUnverifiedModule(), with a flag for configuring + // Like HloTestBase::CreateNewVerifiedModule(), with a flag for configuring // the ftz option. std::unique_ptr CreateNewUnverifiedModuleWithFTZ(bool ftz); diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc index 268b48a1cad..a1ed8499040 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc @@ -46,7 +46,7 @@ TEST_F(GpuCopyTest, UseMemcpy) { std::unique_ptr computation = builder.Build(); - auto hlo_module = CreateNewUnverifiedModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); // There should not be any kernel prefixed "copy". diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc index d0ccd8619bd..5e524faab18 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc @@ -75,16 +75,16 @@ class GpuFtzDisabledTest : public GpuFtzTest { // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise. 
TEST_F(GpuFtzEnabledTest, MultiplyFtz) { CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( - CHECK-NOT: mul.f32 - CHECK: mul.ftz.f32 - CHECK-NOT: mul.f32 + CHECK-NOT: mul.rn.f32 + CHECK: mul.rn.ftz.f32 + CHECK-NOT: mul.rn.f32 )"); } TEST_F(GpuFtzDisabledTest, MultiplyFtz) { CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( - CHECK-NOT: mul.ftz.f32 - CHECK: mul.f32 - CHECK-NOT: mul.ftz.f32 + CHECK-NOT: mul.rn.ftz.f32 + CHECK: mul.rn.f32 + CHECK-NOT: mul.rn.ftz.f32 )"); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc index da8e513a2c3..6814be779e0 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc @@ -51,7 +51,7 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndex) { builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(PRED, {5, 7, 2}), HloOpcode::kGe, param_x, param_y)); - auto hlo_module = CreateNewUnverifiedModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(builder.Build()); // Check the optimized IR as the unoptimized IR contains dead udiv and urem. diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc index ea1fee040dd..3019215c015 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc @@ -48,7 +48,7 @@ TEST_F(GpuLdgTest, LdgForParamRead) { HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param)); std::unique_ptr computation = builder.Build(); - auto hlo_module = CreateNewUnverifiedModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); CompileAndVerifyPtx(std::move(hlo_module), R"( @@ -73,7 +73,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) { builder.AddInstruction(HloInstruction::CreateTuple({add, square})); std::unique_ptr computation = builder.Build(); - auto hlo_module = CreateNewUnverifiedModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); CompileAndVerifyPtx(std::move(hlo_module), R"( @@ -95,7 +95,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) { // reduce in the foreseeable future. But if that turns out to be wrong, I give // you, future reader, permission to delete this test. 
TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) { - auto hlo_module = CreateNewUnverifiedModule(); + auto hlo_module = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloComputation* reduce_computation; diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc index 14285459b5a..ca0a78034d7 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc @@ -47,7 +47,7 @@ TEST_F(GpuNoAliasTest, Concat) { std::unique_ptr computation = builder.Build(); - auto hlo_module = CreateNewUnverifiedModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); CompileAndVerifyIr(std::move(hlo_module), diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc index 141f3219387..6b2d76764a0 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc @@ -45,7 +45,7 @@ void ThunkSchedule::AddDependenciesOnTransitiveOperands( ThunkSchedule::ThunkSchedule( std::unique_ptr thunks, std::unique_ptr stream_assignment, - const std::vector& hlo_total_order) + const std::vector& hlo_total_order) : thunks_(std::move(thunks)), stream_assignment_(std::move(stream_assignment)) { std::unordered_map hlo_to_thunk; @@ -53,7 +53,7 @@ ThunkSchedule::ThunkSchedule( InsertOrDie(&hlo_to_thunk, thunk->hlo_instruction(), thunk.get()); } - for (const HloInstruction* hlo : hlo_total_order) { + for (HloInstruction* hlo : hlo_total_order) { if (hlo_to_thunk.count(hlo)) { thunk_total_order_.push_back(FindOrDie(hlo_to_thunk, hlo)); } diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h index d3352994f84..43b628a1baf 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h +++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h @@ -46,7 +46,7 @@ class ThunkSchedule { public: ThunkSchedule(std::unique_ptr thunks, std::unique_ptr stream_assignment, - const std::vector& hlo_total_order); + const std::vector& hlo_total_order); // Returns the total order of executing all the thunks. const std::vector& TotalOrder() const { return thunk_total_order_; } diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc index c7f51127649..2dce7749bbd 100644 --- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc +++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc @@ -29,7 +29,7 @@ namespace { class WhileTransformerTest : public HloTestBase { protected: WhileTransformerTest() - : module_(CreateNewUnverifiedModule()), + : module_(CreateNewVerifiedModule()), induction_variable_shape_(ShapeUtil::MakeShape(S32, {})), data_shape_(ShapeUtil::MakeShape(F32, {8})), condition_result_shape_(ShapeUtil::MakeShape(PRED, {})) {} diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index fad3215fc81..dc40b9446ad 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -258,7 +258,7 @@ class HeapSimulatorTracker { // Constructor for testing a single entry computation. 
HeapSimulatorTracker( const string& name, std::unique_ptr computation, - const std::vector& instruction_sequence) { + const std::vector& instruction_sequence) { HloModuleConfig config; module_ = absl::make_unique(name, config); module_->AddEntryComputation(std::move(computation)); @@ -286,7 +286,7 @@ class HeapSimulatorTracker { // Similar to the single entry computation constructor above, but runs the // simulation over the entire module. void RunWholeModule( - const std::vector& full_module_sequence) { + const std::vector& full_module_sequence) { points_to_analysis_ = TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); @@ -294,7 +294,7 @@ class HeapSimulatorTracker { HloSchedule schedule(module_.get()); absl::flat_hash_map reverse_position; for (int i = 0; i < full_module_sequence.size(); ++i) { - const HloInstruction* instruction = full_module_sequence[i]; + HloInstruction* instruction = full_module_sequence[i]; schedule.GetOrCreateSequence(instruction->parent()) .push_back(instruction); reverse_position[instruction] = full_module_sequence.size() - i; diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index dbab62f847e..414c6327124 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -51,7 +51,7 @@ message HloInstructionProto { string name = 1; string opcode = 2; - xla.Shape shape = 3; + xla.ShapeProto shape = 3; xla.OpMetadata metadata = 7; @@ -132,7 +132,7 @@ message HloInstructionProto { string custom_call_opaque = 53; // Shape of outfeed request. - xla.Shape outfeed_shape = 29; + xla.ShapeProto outfeed_shape = 29; // Describes the dimension numbers used for a dot operation xla.DotDimensionNumbers dot_dimension_numbers = 30; @@ -190,7 +190,7 @@ message HloInstructionProto { // 'operand_shapes_with_layout' must contain a shape with layout for each // operand. bool constrain_layout = 56; - repeated Shape operand_shapes_with_layout = 57; + repeated xla.ShapeProto operand_shapes_with_layout = 57; } // Serialization of HloComputation. @@ -205,7 +205,8 @@ message HloComputationProto { repeated HloInstructionProto instructions = 2; // The program shape (with layout) of this computation. - xla.ProgramShape program_shape = 4; + + xla.ProgramShapeProto program_shape = 4; // The id of this computation. int64 id = 5; @@ -251,6 +252,41 @@ message HloInputOutputAliasProto { repeated AliasEntryProto entries = 1; } +message DynamicParameterBindingProto { + // A list of bindings which indicates that the `target_dim_num` in + // the subshape `target_param_index` of parameter `target_param_num` + // is a dynamic dimension and its real dynamic size is represented + // by `dynamic_param_index` in parameter `dynamic_param_num`. + // + // As an example, imagine we have a program: + // + // ENTRY main { + // a = f32[] parameter(0) + // b = f32[10] parameter(1) + // ROOT root = (f32[], f32[10]) tuple(%a, %b) + // } + // + // Let's say 'b' (param index 1) is a dynamic shape whose input has + // an upperbound of 10 and real size is determined at runtime.'a' + // represents the real size of b's first dimension. 
+ // + // In this case, the fields are set in the following way: + // dynamic_param_num = 1 + // dynamic_param_index = {} + // target_param_num = 0 + // target_param_index = {} + // target_param_dim = 0 + message Binding { + int64 dynamic_param_num = 1; + repeated int64 dynamic_param_index = 2; + int64 target_param_num = 3; + repeated int64 target_param_index = 4; + int64 target_param_dim_num = 5; + } + + repeated Binding entries = 1; +} + // Serialization of HloModule. message HloModuleProto { string name = 1; @@ -262,7 +298,7 @@ message HloModuleProto { repeated HloComputationProto computations = 3; // The host program shape (with layout) of the entry computation. - xla.ProgramShape host_program_shape = 4; + xla.ProgramShapeProto host_program_shape = 4; // The id of this module. int64 id = 5; @@ -272,6 +308,8 @@ message HloModuleProto { // Describes alias information between inputs and outputs. HloInputOutputAliasProto input_output_alias = 8; + + DynamicParameterBindingProto dynamic_parameter_binding = 9; } // Serialization of LogicalBuffer. diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 0c20d207ddb..ff122b529bd 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -499,7 +499,7 @@ HloComputationProto HloComputation::ToProto() const { proto.add_instructions()->Swap(&instruction_proto); } proto.set_root_id(root_instruction()->unique_id()); - *proto.mutable_program_shape() = ComputeProgramShape(); + *proto.mutable_program_shape() = ComputeProgramShape().ToProto(); return proto; } @@ -711,6 +711,8 @@ bool HloComputation::operator==(const HloComputation& other) const { return eq(root_instruction(), other.root_instruction()); } +uint64 HloComputation::Hash() const { return root_instruction()->Hash(); } + Status HloComputation::ReplaceWithNewInstruction( HloInstruction* old_instruction, std::unique_ptr new_instruction) { @@ -795,7 +797,7 @@ Status HloComputation::AcceptWithOperandOrder( template Status HloComputation::AcceptOrdered( DfsHloVisitorBase* visitor, - const std::vector& order) const { + const std::vector& order) const { VLOG(3) << "Accepting visitor with order."; for (HloInstruction* root : CollectUnreachableRoots()) { TF_RET_CHECK(std::find(order.begin(), order.end(), root) != order.end()) @@ -825,9 +827,9 @@ Status HloComputation::AcceptOrdered( // Explicit instantiations. template Status HloComputation::AcceptOrdered( - DfsHloVisitor*, const std::vector&) const; + DfsHloVisitor*, const std::vector&) const; template Status HloComputation::AcceptOrdered( - ConstDfsHloVisitor*, const std::vector&) const; + ConstDfsHloVisitor*, const std::vector&) const; Status HloComputation::Accept( const std::function& visitor_func) { diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index fc7d2035e5b..c584e4c7ca5 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -264,6 +264,12 @@ class HloComputation { // Return whether `*this` and `other` are functionally equivalent. bool operator==(const HloComputation& other) const; + // Generates a hash value of an HLO computation. Hash considers + // information on opcode, shape, operands, and typically a root instruction. + // This function returns the same hash value for equivalent HLO computations, + // with respect to HloInstruction::Identical() method. 
+ uint64 Hash() const; + // Replaces old instruction with newly created instruction. Removes old // instruction from computation. Updates uses and root instruction. Status ReplaceWithNewInstruction( @@ -301,7 +307,7 @@ class HloComputation { // be a topological sort of all instructions in the computation. template Status AcceptOrdered(DfsHloVisitorBase* visitor, - const std::vector& order) const; + const std::vector& order) const; // Same as Accept() above, but the visitor is given as a function. Status Accept(const std::function& visitor_func); diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc index 1e7a6e197f5..8b50cfa9aed 100644 --- a/tensorflow/compiler/xla/service/hlo_computation_test.cc +++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc @@ -65,7 +65,7 @@ class HloComputationTest : public HloTestBase { }; TEST_F(HloComputationTest, GetEmbeddedComputationsEmpty) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto negate_computation = module->AddEntryComputation(CreateNegateComputation()); EXPECT_TRUE(negate_computation->MakeEmbeddedComputationsList().empty()); @@ -73,7 +73,7 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsEmpty) { TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) { // Create computation which calls one other computation. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto negate_computation = module->AddEmbeddedComputation(CreateNegateComputation()); auto map_computation = @@ -85,7 +85,7 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) { TEST_F(HloComputationTest, GetEmbeddedComputationsDiamond) { // Create computations with a diamond-shaped callgraph. 
- auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto negate_computation = module->AddEmbeddedComputation(CreateNegateComputation()); auto map1_computation = @@ -119,7 +119,7 @@ TEST_F(HloComputationTest, PostOrderSingleton) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant)); } @@ -134,7 +134,7 @@ TEST_F(HloComputationTest, PostOrderSimple) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); auto negate2 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, negate1)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant, negate1, negate2)); @@ -170,7 +170,7 @@ TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant4 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->MakeInstructionPostOrder(), UnorderedElementsAre(constant1, constant2, constant3, constant4)); @@ -192,7 +192,7 @@ TEST_F(HloComputationTest, PostOrderWithMultipleRoots) { r0f32_, HloOpcode::kAdd, constant2, constant3)); auto add3 = builder.AddInstruction(HloInstruction::CreateBinary( r0f32_, HloOpcode::kAdd, constant1, constant3)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); auto post_order = computation->MakeInstructionPostOrder(); EXPECT_EQ(6, post_order.size()); @@ -217,7 +217,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) { constant2, constant3)); builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, constant1, constant3)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Visitor which keeps track of which instructions have been visited. class TestVisitor : public DfsHloVisitorWithDefault { @@ -257,7 +257,7 @@ TEST_F(HloComputationTest, DeepCopyArray) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({1.0, 2.0, 3.0}))); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); auto copy = computation->DeepCopyInstruction(constant).ValueOrDie(); @@ -274,7 +274,7 @@ TEST_F(HloComputationTest, DeepCopyTuple) { auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); auto tuple_copy = computation->DeepCopyInstruction(tuple).ValueOrDie(); @@ -376,7 +376,7 @@ TEST_F(HloComputationTest, DeepCopyToken) { // copied. 
auto builder = HloComputation::Builder(TestName()); auto token = builder.AddInstruction(HloInstruction::CreateToken()); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); auto copy = computation->DeepCopyInstruction(token).ValueOrDie(); @@ -393,7 +393,7 @@ TEST_F(HloComputationTest, DeepCopyTokenTuple) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({token, constant})); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); auto copy = computation->DeepCopyInstruction(tuple).ValueOrDie(); @@ -440,7 +440,7 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) { r0f32_, HloOpcode::kAdd, dead_negate, dead_negate)); auto negate = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(4, computation->instruction_count()); EXPECT_THAT(computation->root_instruction(), op::Negate(constant)); @@ -466,7 +466,7 @@ TEST_F(HloComputationTest, CloneWithControlDependency) { HloInstruction::CreateParameter(0, r0f32_, "param0")); auto negate = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build(/*root_instruction=*/add)); @@ -505,7 +505,7 @@ TEST_F(HloComputationTest, Stringification) { 2, PrecisionConfig::DEFAULT); builder.AddInstruction( HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto options = HloPrintOptions().set_print_metadata(false); @@ -540,7 +540,7 @@ TEST_F(HloComputationTest, StringificationIndent) { 2, PrecisionConfig::DEFAULT); builder.AddInstruction( HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto options = @@ -576,7 +576,7 @@ TEST_F(HloComputationTest, StringificationCanonical) { 2, PrecisionConfig::DEFAULT); builder.AddInstruction( HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto options = HloPrintOptions().set_print_metadata(false); diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc index d12f920722e..4f81dc94e57 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc @@ -22,21 +22,22 @@ limitations under the License. 
#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/types.h" -namespace op = xla::testing::opcode_matchers; - namespace xla { namespace { +namespace m = xla::match; + using HloConstantFoldingTest = HloTestBase; TEST_F(HloConstantFoldingTest, ConvertF32ToS64) { @@ -49,13 +50,14 @@ TEST_F(HloConstantFoldingTest, ConvertF32ToS64) { auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Convert(input)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Convert().WithOperand(0, m::Op().Is(input)))); HloConstantFolding const_folder; TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get())); EXPECT_TRUE(result); - EXPECT_THAT(computation->root_instruction(), op::Constant()); + EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Constant())); EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement(), 42); } @@ -70,13 +72,14 @@ TEST_F(HloConstantFoldingTest, ConvertS64ToF32) { auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Convert(input)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Convert().WithOperand(0, m::Op().Is(input)))); HloConstantFolding const_folder; TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get())); EXPECT_TRUE(result); - EXPECT_THAT(computation->root_instruction(), op::Constant()); + EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Constant())); EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement(), 42.0f); } @@ -91,13 +94,14 @@ TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) { auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), op::Convert(input)); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Convert().WithOperand(0, m::Op().Is(input)))); HloConstantFolding const_folder; TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get())); EXPECT_TRUE(result); - EXPECT_THAT(computation->root_instruction(), op::Constant()); + EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Constant())); EXPECT_EQ(computation->root_instruction()->literal().Get({0}), 42); EXPECT_EQ(computation->root_instruction()->literal().Get({1}), 19); } @@ -138,7 +142,7 @@ TEST_F(HloConstantFoldingTest, Concatenate) { EXPECT_TRUE(result); HloInstruction* root = computation->root_instruction(); - EXPECT_THAT(root, op::Constant()); + EXPECT_THAT(root, GmockMatch(m::Constant())); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape)); } } @@ -165,7 +169,7 @@ TEST_F(HloConstantFoldingTest, Slice) { EXPECT_TRUE(result); HloInstruction* root = 
computation->root_instruction(); - EXPECT_THAT(root, op::Constant()); + EXPECT_THAT(root, GmockMatch(m::Constant())); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape)); } @@ -190,7 +194,7 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) { EXPECT_TRUE(result); HloInstruction* root = computation->root_instruction(); - EXPECT_THAT(root, op::Constant()); + EXPECT_THAT(root, GmockMatch(m::Constant())); EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), shape)); using NativeT = typename primitive_util::PrimitiveTypeToNative::type; @@ -240,7 +244,8 @@ TEST_F(HloConstantFoldingTest, ConstantFoldReduceNoLayout) { TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(m.get())); EXPECT_FALSE(result); - EXPECT_THAT(m->entry_computation()->root_instruction(), op::Reduce()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Reduce())); } const char* const kConstantFoldLargePad = R"( @@ -260,7 +265,7 @@ TEST_F(HloConstantFoldingTest, DoesNotFoldLargePad) { EXPECT_FALSE(result); EXPECT_THAT(module->entry_computation()->root_instruction(), - op::Pad(op::Constant(), op::Constant())); + GmockMatch(m::Pad(m::Constant(), m::Constant()))); } } // namespace diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index fdfb38b858c..df7d3826dba 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -419,6 +419,21 @@ Status HloCostAnalysis::HandleTranspose(const HloInstruction*) { } Status HloCostAnalysis::HandleAfterAll(const HloInstruction*) { + // This instruction is used to enforce ordering at compile time. No code is + // emitted. + current_should_compute_bottleneck_time_ = false; + current_properties_[kBytesAccessedKey] = 0; + current_properties_[kOptimalSecondsKey] = 0; + return Status::OK(); +} + +Status HloCostAnalysis::HandleAddDependency( + const HloInstruction* add_dependency) { + // This instruction is used to enforce ordering at compile time. No code is + // emitted. 
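  // Illustrative consequence (sketch only; `shape_size_fn` is a hypothetical
  // ShapeSizeFunction, not part of this change): once the analysis has run,
  // ordering-only instructions report zero cost, e.g.
  //
  //   HloCostAnalysis analysis(shape_size_fn);
  //   TF_CHECK_OK(computation->Accept(&analysis));
  //   EXPECT_EQ(analysis.bytes_accessed(*add_dependency), 0);
  //   EXPECT_EQ(analysis.optimal_seconds(*add_dependency), 0);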
+ current_should_compute_bottleneck_time_ = false; + current_properties_[kBytesAccessedKey] = 0; + current_properties_[kOptimalSecondsKey] = 0; return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 8ced9d776e1..33983119c9b 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -101,6 +101,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleBroadcast(const HloInstruction* broadcast) override; Status HandlePad(const HloInstruction* pad) override; Status HandleReshape(const HloInstruction* reshape) override; + Status HandleAddDependency(const HloInstruction* add_dependency) override; Status HandleAfterAll(const HloInstruction* token) override; Status HandleTranspose(const HloInstruction* transpose) override; Status HandleWhile(const HloInstruction* xla_while) override; diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc index 6a15b3440c6..ff32faf298d 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc @@ -387,7 +387,7 @@ TEST_F(FusionCostAnalysis, LoopFusion) { HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract, mul, clamp)); auto tuple = HloInstruction::CreateTuple({sub, sub, mul, c1}); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop); @@ -429,7 +429,7 @@ TEST_F(FusionCostAnalysis, NoLayout) { auto add = builder.AddInstruction(HloInstruction::CreateBinary( shape_with_layout, HloOpcode::kAdd, c1, broadcast)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {add, broadcast}, HloInstruction::FusionKind::kLoop); @@ -472,7 +472,7 @@ TEST_F(DomainCostAnalysis, DomainCost) { auto domain = builder.AddInstruction( HloInstruction::CreateDomain(tuple->shape(), tuple, nullptr, nullptr)); - auto hlo_module = CreateNewUnverifiedModule(); + auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(builder.Build()); EXPECT_EQ(hlo_module->entry_computation()->root_instruction(), domain); diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index 5dcf6bc985f..3ed3d3c11c7 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -466,6 +466,21 @@ bool HloDataflowAnalysis::UpdateDomainValueSet(HloInstruction* domain) { return changed; } +bool HloDataflowAnalysis::UpdateAddDependencyValueSet( + HloInstruction* add_dependency) { + // AddDependency just forwards the value of its zero-th operand. 
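  // For example, given
  //
  //   %token = token[] after-all()
  //   %ad    = f32[3] add-dependency(f32[3] %data, token[] %token)
  //
  // %ad carries exactly the HloValue defined by %data; the token operand only
  // constrains ordering and contributes no values of its own here.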
+ CHECK_EQ(add_dependency->opcode(), HloOpcode::kAddDependency); + const InstructionValueSet& operand_set = + GetInstructionValueSet(add_dependency->operand(0)); + InstructionValueSet& add_dependency_set = + GetInstructionValueSet(add_dependency); + if (operand_set != add_dependency_set) { + add_dependency_set = operand_set; + return true; + } + return false; +} + bool HloDataflowAnalysis::UpdateGetTupleElementValueSet(HloInstruction* gte) { CHECK_EQ(gte->opcode(), HloOpcode::kGetTupleElement); bool changed = false; @@ -622,6 +637,8 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet( HloInstruction* instruction) { // Recompute from operands. switch (instruction->opcode()) { + case HloOpcode::kAddDependency: + return UpdateAddDependencyValueSet(instruction); case HloOpcode::kBitcast: return UpdateBitcastValueSet(instruction); case HloOpcode::kDomain: @@ -795,6 +812,7 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { define_all_values(); } break; + case HloOpcode::kAddDependency: case HloOpcode::kWhile: case HloOpcode::kCall: case HloOpcode::kConditional: diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index abac398c04f..ece17fc4c3e 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -193,6 +193,7 @@ class HloDataflowAnalysis { bool UpdateSendValueSet(HloInstruction* send); bool UpdateTupleValueSet(HloInstruction* tuple); bool UpdateWhileValueSet(HloInstruction* xla_while); + bool UpdateAddDependencyValueSet(HloInstruction* add_dependency); // Propagate the dataflow through the module. void Propagate(); diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 6422346c101..f7a1f19a6f5 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -43,7 +43,7 @@ using ::testing::UnorderedElementsAre; class HloDataflowAnalysisTest : public HloTestBase, public ::testing::WithParamInterface { protected: - HloDataflowAnalysisTest() : module_(CreateNewUnverifiedModule()) {} + HloDataflowAnalysisTest() : module_(CreateNewVerifiedModule()) {} // Run dataflow analysis on the member module. For convenience returns a // reference to the generated analysis stored in analysis_. @@ -1877,6 +1877,30 @@ TEST_P(HloDataflowAnalysisTest, NestedConditionals) { } } +TEST_P(HloDataflowAnalysisTest, AddDependency) { + string module_string = R"( +HloModule AddDependency +ENTRY %AddDependency (p: f32[3]) -> f32[3] { + %p = f32[3] parameter(0) + %token = token[] after-all() + ROOT %add_dep = f32[3] add-dependency(f32[3] %p, token[] %token) +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseHloString(module_string, GetModuleConfigForTest())); + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr analysis, + HloDataflowAnalysis::Run(*module)); + const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAddDependency); + + // The after-all and parameter should define a value. Add-dependency should + // not. 
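  // Concretely, the two HloValues are the f32[3] array defined by %p and the
  // token defined by %token; %add_dep merely forwards the value of %p.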
+ EXPECT_EQ(analysis->values().size(), 2); + EXPECT_FALSE(analysis->ValueIsDefinedAt(root)); +} + INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation, HloDataflowAnalysisTest, ::testing::Values(false, true)); diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc index 6c8095d3977..1fa4259a3e4 100644 --- a/tensorflow/compiler/xla/service/hlo_dce_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc @@ -59,7 +59,7 @@ TEST_F(HloDceTest, NoDeadCode) { builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, computation->instruction_count()); @@ -110,7 +110,7 @@ TEST_F(HloDceTest, DeadParameters) { builder.AddInstruction(HloInstruction::CreateUnary( live_param->shape(), HloOpcode::kNegate, live_param)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(5, computation->instruction_count()); @@ -150,7 +150,7 @@ TEST_F(HloDceTest, ControlDependencies) { builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Add a control dependency between two instructions. @@ -175,7 +175,7 @@ TEST_F(HloDceTest, ControlDependencies) { // Tests that a dead call instruction is removed. TEST_F(HloDceTest, DeadInstructionWithCalledComputation) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); Shape shape = ShapeUtil::MakeShape(F32, {}); // Called computation for the call instruction. @@ -323,7 +323,7 @@ TEST_F(HloDceTest, CalledComputationWithNestedSideEffect) { } TEST_F(HloDceTest, RemoveDeadSubcomputation) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); HloComputation::Builder subcomp_builder("reduction_subcomp"); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index 7fcafafc097..3a7652a8dc8 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -39,6 +39,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_query.h" #include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" @@ -396,6 +397,16 @@ StatusOr HloEvaluator::EvaluateDotOp( return Evaluate(cloned_instruction.get()); } +Status HloEvaluator::HandleBitcast(HloInstruction* bitcast) { + const Literal& operand_literal = GetEvaluatedLiteralFor(bitcast->operand(0)); + Literal result(bitcast->shape()); + TF_RET_CHECK(operand_literal.size_bytes() == result.size_bytes()); + memcpy(result.untyped_data(), operand_literal.untyped_data(), + operand_literal.size_bytes()); + evaluated_[bitcast] = std::move(result); + return Status::OK(); +} + Status HloEvaluator::HandleParameter(HloInstruction* parameter) { CHECK_LT(parameter->parameter_number(), arg_literals_.size()); const Literal* input_literal = arg_literals_[parameter->parameter_number()]; @@ -1046,8 +1057,15 @@ Status HloEvaluator::HandleBroadcast(HloInstruction* broadcast) { return Status::OK(); } -Status HloEvaluator::HandleAfterAll(HloInstruction* token) { - evaluated_[token] = LiteralUtil::CreateToken(); +Status HloEvaluator::HandleAfterAll(HloInstruction* after_all) { + evaluated_[after_all] = LiteralUtil::CreateToken(); + return Status::OK(); +} + +Status HloEvaluator::HandleAddDependency(HloInstruction* add_dependency) { + // AddDedendency just forwards its zero-th operand. + evaluated_[add_dependency] = + GetEvaluatedLiteralFor(add_dependency->operand(0)).Clone(); return Status::OK(); } @@ -1279,10 +1297,10 @@ StatusOr EvaluateSortInternal(HloInstruction* sort, key_value_vector.push_back( std::make_pair(keys_data[i], values_data[i])); } - std::sort(key_value_vector.begin(), key_value_vector.end(), - [](const kv_pair& a, const kv_pair& b) { - return SafeLess(a.first, b.first); - }); + std::stable_sort(key_value_vector.begin(), key_value_vector.end(), + [](const kv_pair& a, const kv_pair& b) { + return SafeLess(a.first, b.first); + }); std::vector result_keys; // We use a InlinedVector here because we need to convert it to an // absl::Span later, and this would not work with std::vector. diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h index 07f8d0aad4a..45ed8131dc6 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator.h @@ -144,6 +144,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault { // Operations that are type-agnostic or always return a specific type, such as // HandleIsFinite where boolean is always returned. 
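  // HandleBitcast also belongs in this group: the evaluator reinterprets the
  // operand literal's bytes in the result shape (a memcpy between equal-sized
  // buffers), so no per-element-type arithmetic is involved.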
// + Status HandleBitcast(HloInstruction* bitcast) override; + Status HandleParameter(HloInstruction* parameter) override; Status HandleConstant(HloInstruction* constant) override; @@ -180,7 +182,9 @@ class HloEvaluator : public DfsHloVisitorWithDefault { Status HandleBroadcast(HloInstruction* broadcast) override; - Status HandleAfterAll(HloInstruction* token) override; + Status HandleAfterAll(HloInstruction* after_all) override; + + Status HandleAddDependency(HloInstruction* add_dependency) override; Status HandleSort(HloInstruction* sort) override; @@ -221,16 +225,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault { const Literal& operand_literal) { const auto shape = instruction->shape(); const auto* operand = instruction->operand(0); - - // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is - // removed. - if (!ShapeUtil::SameDimensions(shape, operand->shape())) { - return Unimplemented( - "Implicit broadcasting is currently unsupported in HLO evaluator " - "Shape Mismatch: %s vs %s", - ShapeUtil::HumanString(shape), - ShapeUtil::HumanString(operand->shape())); - } + TF_RET_CHECK(ShapeUtil::SameDimensions(shape, operand->shape())); Literal result(shape); TF_RETURN_IF_ERROR( diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index d95b6ad04f2..4eaaab20ea0 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -22,6 +22,7 @@ limitations under the License. #include #include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/reference_util.h" @@ -35,6 +36,7 @@ limitations under the License. #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -2765,6 +2767,33 @@ ENTRY main { EXPECT_TRUE(LiteralTestUtil::Equal(arg, actual)); } +TEST_P(HloEvaluatorTest, Bitcast) { + // Regression test for b/114735354. 
+ constexpr absl::string_view hlo_text_base = R"( +HloModule Bitcast + +ENTRY main { + param = %s[32,121]{1,0} parameter(0) + ROOT bitcast = %s[121,32,1]{0,1,2} bitcast(%s[32,121]{1,0} param) +} +)"; + string hlo_text; + if (use_bfloat16_) { + hlo_text = absl::StrFormat(hlo_text_base, "bf16", "bf16", "bf16"); + } else { + hlo_text = absl::StrFormat(hlo_text_base, "f32", "f32", "f32"); + } + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie(); + Literal actual = Evaluate({&args[0]}); + if (use_bfloat16_) { + EXPECT_TRUE( + absl::c_equal(args[0].data(), actual.data())); + } else { + EXPECT_TRUE(absl::c_equal(args[0].data(), actual.data())); + } +} + INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest, ::testing::ValuesIn(use_bf16_params)); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index ebed875eb49..b87fc3e3401 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -161,9 +161,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { HloOpcodeString(hlo_instruction->opcode())); } - // TODO(b/35950897): many of the stl functions used in the handlers are not - // overloaded for every XLA primitive type. - template ::value>::type* = nullptr> @@ -596,7 +593,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } - Status HandleDivide(HloInstruction* divide) { + Status HandleDivide(HloInstruction* divide) override { return HandleDivide(divide); } @@ -1556,10 +1553,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { const auto& row_data = row_to_sort.data(); std::vector result_data(row_data.begin(), row_data.end()); - std::sort(result_data.begin(), result_data.end(), - [](const NativeT& a, const NativeT& b) { - return SafeLess(a, b); - }); + std::stable_sort(result_data.begin(), result_data.end(), + [](const NativeT& a, const NativeT& b) { + return SafeLess(a, b); + }); Literal sorted_row(ShapeUtil::MakeShape(keys->shape().element_type(), {sort_dim_elements})); sorted_row.PopulateR1(absl::Span(result_data)); @@ -2546,12 +2543,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { template ::value || - std::is_same::value || - std::is_same::value>::type* = nullptr> + std::is_integral::value || + std::is_floating_point::value>::type* = nullptr> Status HandleIota(HloInstruction* instruction) { auto* iota = Cast(instruction); - std::vector data(iota->shape().dimensions(iota->iota_dimension())); + // Avoid using std::vector since std::vector does not convert to + // absl::Span. 
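  // (std::vector<bool> packs its elements into bits and hands out proxy
  // references rather than real bool lvalues, so it has no data() returning
  // bool* for absl::Span to view. An InlinedVector stores contiguous bools:
  //
  //   absl::InlinedVector<bool, 1> bits(8);
  //   absl::Span<const bool> view = bits;  // OK
  //   // std::vector<bool> b(8); absl::Span<const bool> bad = b;  // no data()
  // )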
+ absl::InlinedVector data( + iota->shape().dimensions(iota->iota_dimension())); std::iota(data.begin(), data.end(), 0); auto result = LiteralUtil::CreateR1(data); @@ -2568,9 +2567,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { } template ::value || - std::is_same::value || - std::is_same::value)>::type* = nullptr> + !(std::is_integral::value || + std::is_floating_point::value)>::type* = nullptr> Status HandleIota(HloInstruction* iota) { return InvalidArgument("Unsupported type for iota"); } @@ -2722,17 +2720,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { const auto shape = instruction->shape(); const auto* lhs = instruction->operand(0); const auto* rhs = instruction->operand(1); - - // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast - // is removed. - if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) && - ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) { - return Unimplemented( - "Implicit broadcasting is currently unsupported in HLO evaluator " - "Shape Mismatch: %s vs %s vs %s: ", - ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()), - ShapeUtil::HumanString(rhs->shape())); - } + TF_RET_CHECK(ShapeUtil::SameDimensions(shape, rhs->shape())); + TF_RET_CHECK(ShapeUtil::SameDimensions(lhs->shape(), rhs->shape())); const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); @@ -2756,19 +2745,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { const auto* lhs = instruction->operand(0); const auto* rhs = instruction->operand(1); const auto* ehs = instruction->operand(2); - - // TODO(b/35950897, b/27796129): add DCHECK back once implicit - // broadcast is removed. - if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) && - ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) && - ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) { - return Unimplemented( - "Implicit broadcasting is currently unsupported in HLO evaluator " - "Shape Mismatch: %s vs %s vs %s vs %s: ", - ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()), - ShapeUtil::HumanString(rhs->shape()), - ShapeUtil::HumanString(ehs->shape())); - } + TF_RET_CHECK(ShapeUtil::SameDimensions(shape, lhs->shape())); + TF_RET_CHECK(ShapeUtil::SameDimensions(lhs->shape(), rhs->shape())); + TF_RET_CHECK(ShapeUtil::SameDimensions(rhs->shape(), ehs->shape())); const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc new file mode 100644 index 00000000000..c919dbd82d3 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc @@ -0,0 +1,61 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" + +namespace xla { + +namespace { + +StatusOr ReplaceGetSize(HloInstruction* instr) { + if (instr->opcode() != HloOpcode::kGetDimensionSize) { + return false; + } + HloComputation* computation = instr->parent(); + + TF_ASSIGN_OR_RETURN(auto legal_shape, + ShapeInference::InferGetDimensionSizeShape( + instr->operand(0)->shape(), instr->dimension())); + TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape)); + TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), U32)); + uint32 size = instr->operand(0)->shape().dimensions(instr->dimension()); + HloInstruction* new_instr = computation->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(size))); + TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr)); + return true; +} + +} // namespace + +StatusOr HloGetDimensionSizeRewriter::Run(HloModule* module) { + bool changed = false; + HloProto proto; + *proto.mutable_hlo_module() = module->ToProto(); + for (auto* computation : module->computations()) { + for (auto instruction : computation->instructions()) { + TF_ASSIGN_OR_RETURN(bool replaced, ReplaceGetSize(instruction)); + changed = changed || replaced; + } + } + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h new file mode 100644 index 00000000000..30f44c23a83 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h @@ -0,0 +1,36 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_ + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// Pass to replace a kGetDimensionSize instruction with a constant instruction. 
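// For example, with %p = f32[3,4] parameter(0), the instruction
//
//   %size = u32[] get-dimension-size(%p), dimensions={1}
//
// is replaced by the equivalent
//
//   %size = u32[] constant(4)
//
// because the dimension's extent is statically known from %p's shape.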
+class HloGetDimensionSizeRewriter : public HloModulePass { + public: + absl::string_view name() const override { + return "hlo-get-dimension-size-rewriter"; + } + + StatusOr Run(HloModule* module) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_ diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc new file mode 100644 index 00000000000..a86aebdd5b6 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc @@ -0,0 +1,83 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +namespace op = xla::testing::opcode_matchers; + +class HloGetDimensionSizeRewriterTest : public HloTestBase { + protected: + HloGetDimensionSizeRewriterTest() {} +}; + +TEST_F(HloGetDimensionSizeRewriterTest, Ok) { + auto module = ParseHloString(R"( +HloModule _ +ENTRY gds { + p = s32[3,4] parameter(0) + size0 = u32[] get-dimension-size(p), dimensions={0} + size1 = u32[] get-dimension-size(p), dimensions={1} + ROOT mul = u32[] multiply(size0, size1) +})") + .ValueOrDie(); + HloGetDimensionSizeRewriter pass; + EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Multiply(op::Constant(), op::Constant())); +} + +TEST_F(HloGetDimensionSizeRewriterTest, IllegalType) { + auto module = ParseHloString(R"( +HloModule _ +ENTRY gds { + p = s32[3]{0} parameter(0) + ROOT gds = s64[] get-dimension-size(p), dimensions={0} +})") + .ValueOrDie(); + HloGetDimensionSizeRewriter pass; + EXPECT_FALSE(pass.Run(module.get()).ok()); +} + +TEST_F(HloGetDimensionSizeRewriterTest, IllegalDimension) { + auto module = ParseHloString(R"( +HloModule _ +ENTRY gds { + p = f32[2,5] parameter(0) + ROOT gds = u32[] get-dimension-size(p), dimensions={2} +})") + .ValueOrDie(); + HloGetDimensionSizeRewriter pass; + EXPECT_FALSE(pass.Run(module.get()).ok()); +} + +} // namespace +} // namespace xla diff --git 
a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 05cc1593e4e..302eca656be 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -21,6 +21,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -38,6 +39,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/window_util.h" @@ -111,11 +113,6 @@ class NodeFilter { result == kSomeUsersOmitted; } - bool ShowFusionSubcomputation(const HloInstruction* instr) const { - CHECK_EQ(instr->opcode(), HloOpcode::kFusion); - return Show(instr) && !SomeOrAllOperandsOmitted(instr); - } - private: std::function filter_; }; @@ -240,34 +237,28 @@ string HtmlLikeStringSanitize(absl::string_view s) { // it to a short string lets us tell the user what the subcomputation is without // drawing it as a graph. optional MatchTrivialComputation(const HloComputation* computation) { + namespace m = match; + if (computation->instruction_count() != 3) { return nullopt; } - HloInstruction* root = computation->root_instruction(); - if (root->operand_count() != 2) { + const HloInstruction *param0, *param1; + if (!Match(root, m::Op() + .WithNumOperands(2) + .WithShape(m::Shape().IsEffectiveScalar()) + .WithBinaryOperandsAnyOrder( + m::Parameter(¶m0, 0) + .WithShape(m::Shape().IsEffectiveScalar()), + m::Parameter(¶m1, 1) + .WithShape(m::Shape().IsEffectiveScalar())))) { return nullopt; } - // Check that both of the operands to the root are parameters. - const HloInstruction* operand0 = root->operand(0); - const HloInstruction* operand1 = root->operand(1); - if (operand0->opcode() != HloOpcode::kParameter || - operand1->opcode() != HloOpcode::kParameter) { - return nullopt; - } - - // Check that the two operands of root are param0 and param1. All of the - // opcodes we recognize are commutative, so we're OK with either order. - auto n0 = operand0->parameter_number(); - auto n1 = operand1->parameter_number(); - if (!(n0 == 0 && n1 == 1) && !(n1 == 0 && n0 == 1)) { - return nullopt; - } - - // If the params are reversed, check that the operation being performed is - // commutative. - if (n0 == 1) { + // If the params are reversed (i.e. operand0 is param1 and operand1 is + // param0), check that the operation being performed is commutative. + if (root->operand(0) == param1) { + CHECK_EQ(root->operand(1), param0); switch (root->opcode()) { case HloOpcode::kLe: case HloOpcode::kGe: @@ -279,13 +270,6 @@ optional MatchTrivialComputation(const HloComputation* computation) { } } - // Check that the root and params are all effective scalars. - if (!ShapeUtil::IsEffectiveScalar(root->shape()) || - !ShapeUtil::IsEffectiveScalar(operand0->shape()) || - !ShapeUtil::IsEffectiveScalar(operand1->shape())) { - return nullopt; - } - // If we recognize the root's opcode, we've successfully pattern-matched! switch (root->opcode()) { case HloOpcode::kAdd: @@ -578,7 +562,7 @@ bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) { // Show the subcomputation if we're showing any of its members. 
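  // The scan below must be over `subcomp`'s own instructions, not over
  // `computation_` (the computation being dumped); otherwise the decision
  // would not depend on `subcomp` at all.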
return std::any_of( - computation_->instructions().begin(), computation_->instructions().end(), + subcomp->instructions().begin(), subcomp->instructions().end(), [&](const HloInstruction* instr) { return filter_.Show(instr); }); } @@ -987,6 +971,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kGetTupleElement: case HloOpcode::kTrace: case HloOpcode::kAfterAll: + case HloOpcode::kAddDependency: case HloOpcode::kTuple: return kWhite; case HloOpcode::kBroadcast: @@ -1267,12 +1252,12 @@ const HloInstruction* HloDotDumper::GetNodeForEdge( class GraphRendererRegistry { public: - void AddRenderer(GraphRendererInterface* graph_renderer) { + void SetRenderer(std::shared_ptr graph_renderer) { tensorflow::mutex_lock lock(mu_); graph_renderer_ = graph_renderer; } - GraphRendererInterface* GetDefaultRenderer() { + std::shared_ptr GetDefaultRenderer() { tensorflow::mutex_lock lock(mu_); return graph_renderer_; } @@ -1284,20 +1269,21 @@ class GraphRendererRegistry { private: tensorflow::mutex mu_; - GraphRendererInterface* graph_renderer_ = nullptr; + std::shared_ptr graph_renderer_ GUARDED_BY(mu_); }; } // namespace -Registrar::Registrar(GraphRendererInterface* dumper) { - GraphRendererRegistry::Default()->AddRenderer(dumper); +Registrar::Registrar(std::shared_ptr dumper) { + GraphRendererRegistry::Default()->SetRenderer(dumper); } namespace { // Gets a NodeFilter that includes roughly all instructions whose distance from // root is <= radius. -NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { +NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root, + int64 radius) { // First, find the neighborhood of nodes with distance from root <= radius. // These nodes are our initial set of "normal" nodes. std::unordered_map nodes; @@ -1404,6 +1390,56 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { }); } +// Gets a node filter that includes nodes on all paths from `from` to `to`. If +// the all-paths set contains more than max_nodes elements, includes the nodes +// on the shortest paths and sets hit_limit to true. +NodeFilter MakeNodeFromToFilter(const HloInstruction* from, + const HloInstruction* to, int64 max_nodes, + bool* hit_limit) { + *hit_limit = false; + + // Elements in the queue are paths through the graph. + std::deque> queue; + queue.push_front({from}); + + // Compute the set of nodes we want to show using a slightly-modified + // Djikstra's algorithm. The only real difference is, rather than stopping + // when we find a (shortest) path, we continue until we've found max_nodes + // nodes on some path. + std::unordered_set visited; + std::unordered_set to_display = {from, to}; + while (!queue.empty() && to_display.size() < max_nodes) { + std::vector path = std::move(queue.front()); + queue.pop_front(); + if (!visited.insert(path.back()).second) { + continue; + } + + for (const auto* user : path.back()->users()) { + if (user == to) { + auto it = path.begin(); + for (; it != path.end() && to_display.size() < max_nodes; ++it) { + to_display.insert(*it); + } + if (it != path.end()) { + *hit_limit = true; + } + } else if (!visited.count(user)) { + auto new_path = path; + new_path.push_back(user); + queue.push_back(std::move(new_path)); + } + } + } + + return NodeFilter([=](const HloInstruction* instr) { + if (instr == from || instr == to) { + return kHighlightNode; + } + return to_display.count(instr) ? 
kNormalNode : kHideNode; + }); +} + string SaveGraph(const string& graph, GraphRendererInterface::GraphKind graph_kind, const string& dest_path) { @@ -1483,7 +1519,7 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius, auto debug_options = node.GetModule()->config().debug_options(); string label = StrCat("Neighborhood of ", radius, " nodes around ", node.name()); - NodeFilter filter = MakeNodeFilter(&node, radius); + NodeFilter filter = MakeNodeRadiusAroundFilter(&node, radius); string graph = HloDotDumper(node.parent(), label, debug_options, show_backend_config, /*profile=*/nullptr, filter) @@ -1491,6 +1527,29 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius, return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options); } +string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, + int64 max_nodes, bool show_backend_config) { + CHECK_EQ(from.parent(), to.parent()) << "Nodes must be in same computation!"; + auto debug_options = from.GetModule()->config().debug_options(); + + bool hit_limit = false; + NodeFilter filter = MakeNodeFromToFilter(&from, &to, max_nodes, &hit_limit); + string label; + if (!hit_limit) { + label = StrCat("All paths from ", from.name(), " to ", to.name()); + } else { + label = StrCat(max_nodes, " nodes on the shortest paths from ", from.name(), + " to ", to.name(), + "
<br/><br/>***SHOWING ONLY A SUBSET OF ALL PATHS BETWEEN " "NODES***<br/><br/>
"); + } + string graph = + HloDotDumper(from.parent(), label, debug_options, show_backend_config, + /*profile=*/nullptr, filter) + .Dump(); + return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options); +} + void DumpText(const HloModule& module, const string& label, const string& directory_path, bool do_prefix) { Env* env = Env::Default(); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h index 0b11f34abb7..de1eefab776 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h @@ -66,6 +66,12 @@ string DumpGraph(const HloComputation& computation, const string& label, string DumpNeighborhoodAround(const HloInstruction& node, int radius, bool show_backend_config = false); +// Dumps nodes on any of the paths from `from` to `to`. If there are more than +// max_nodes on all paths, restricts to the max_nodes nodes on the shortest +// paths. +string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, + int64 max_nodes, bool show_backend_config = false); + // Dumps the HloModule::ToString() as a file into the provided directory path // suffixed with the provided label. // @@ -87,13 +93,13 @@ void DumpText(const HloModule& module, const string& label, // Class that registers a graph renderer. class Registrar { public: - Registrar(GraphRendererInterface* dumper); + Registrar(std::shared_ptr dumper); }; -#define XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, ctr, ...) \ - static ::xla::hlo_graph_dumper::Registrar \ - XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr)(new factory, \ - ##__VA_ARGS__) +#define XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, ctr, ...) \ + static ::xla::hlo_graph_dumper::Registrar \ + XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr)( \ + std::make_shared(), ##__VA_ARGS__) // __COUNTER__ must go through another macro to be properly expanded #define XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr) ___##ctr##__object_ diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 26786ee950b..21b1dbc1676 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -93,7 +93,8 @@ StatusOr> HloInstruction::CreateFromProto( [&computation_map](int64 id) { return computation_map.contains(id); })) << proto.name() << " instruction references invalid computation id(s)"; - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(proto.shape())); + Shape shape(proto.shape()); + TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); switch (opcode) { // Ops migrated to subclasses. 
@@ -101,23 +102,23 @@ StatusOr> HloInstruction::CreateFromProto( TF_RET_CHECK(proto.operand_ids_size() == 3) << "BatchNormTraining instruction should have 3 operands but sees " << proto.operand_ids_size(); - instruction = CreateBatchNormTraining( - proto.shape(), operands(0), operands(1), operands(2), proto.epsilon(), - proto.feature_index()); + instruction = + CreateBatchNormTraining(shape, operands(0), operands(1), operands(2), + proto.epsilon(), proto.feature_index()); break; case HloOpcode::kBatchNormInference: TF_RET_CHECK(proto.operand_ids_size() == 5) << "BatchNormInference instruction should have 5 operands but sees " << proto.operand_ids_size(); instruction = CreateBatchNormInference( - proto.shape(), operands(0), operands(1), operands(2), operands(3), + shape, operands(0), operands(1), operands(2), operands(3), operands(4), proto.epsilon(), proto.feature_index()); break; case HloOpcode::kBatchNormGrad: TF_RET_CHECK(proto.operand_ids_size() == 5) << "BatchNormGrad instruction should have 5 operands but sees " << proto.operand_ids_size(); - instruction = CreateBatchNormGrad(proto.shape(), operands(0), operands(1), + instruction = CreateBatchNormGrad(shape, operands(0), operands(1), operands(2), operands(3), operands(4), proto.epsilon(), proto.feature_index()); break; @@ -127,7 +128,7 @@ StatusOr> HloInstruction::CreateFromProto( << proto.operand_ids_size(); std::vector fft_length(proto.fft_length().begin(), proto.fft_length().end()); - instruction = CreateFft(proto.shape(), operands(0), proto.fft_type(), + instruction = CreateFft(shape, operands(0), proto.fft_type(), absl::Span(fft_length)); break; } @@ -148,7 +149,7 @@ StatusOr> HloInstruction::CreateFromProto( TF_RET_CHECK(proto.operand_ids_size() == 1) << "Recv instruction should have 1 operand but sees " << proto.operand_ids_size(); - instruction = CreateRecv(proto.shape().tuple_shapes(0), operands(0), + instruction = CreateRecv(shape.tuple_shapes(0), operands(0), proto.channel_id(), proto.is_host_transfer()); break; case HloOpcode::kRecvDone: @@ -161,7 +162,7 @@ StatusOr> HloInstruction::CreateFromProto( TF_RET_CHECK(proto.operand_ids_size() == 1) << "Reverse instruction should have 1 operand but sees " << proto.operand_ids_size(); - instruction = CreateReverse(proto.shape(), operands(0), + instruction = CreateReverse(shape, operands(0), std::vector(proto.dimensions().begin(), proto.dimensions().end())); break; @@ -170,7 +171,7 @@ StatusOr> HloInstruction::CreateFromProto( << "Concatenate instruction should have 1 dimension but sees " << proto.dimensions_size(); instruction = - CreateConcatenate(proto.shape(), all_operands(), proto.dimensions(0)); + CreateConcatenate(shape, all_operands(), proto.dimensions(0)); break; case HloOpcode::kReduce: TF_RET_CHECK(proto.operand_ids_size() % 2 == 0) @@ -188,7 +189,7 @@ StatusOr> HloInstruction::CreateFromProto( absl::MakeSpan(reduce_operands) .subspan(reduce_operands.size() / 2, reduce_operands.size()); instruction = - CreateReduce(proto.shape(), inputs, init_values, + CreateReduce(shape, inputs, init_values, std::vector(proto.dimensions().begin(), proto.dimensions().end()), computations(0)); @@ -203,7 +204,7 @@ StatusOr> HloInstruction::CreateFromProto( auto sort_operands = all_operands(); HloInstruction* keys = sort_operands[0]; instruction = CreateSort( - proto.shape(), proto.dimensions(0), keys, + shape, proto.dimensions(0), keys, absl::Span(sort_operands).subspan(1)); break; } @@ -212,7 +213,7 @@ StatusOr> HloInstruction::CreateFromProto( << "Transpose instruction should have 1 
operand but sees " << proto.operand_ids_size(); instruction = - CreateTranspose(proto.shape(), operands(0), + CreateTranspose(shape, operands(0), std::vector(proto.dimensions().begin(), proto.dimensions().end())); break; @@ -221,7 +222,7 @@ StatusOr> HloInstruction::CreateFromProto( << "Broadcast instruction should have 1 operand but sees " << proto.operand_ids_size(); instruction = - CreateBroadcast(proto.shape(), operands(0), + CreateBroadcast(shape, operands(0), std::vector(proto.dimensions().begin(), proto.dimensions().end())); break; @@ -229,7 +230,7 @@ StatusOr> HloInstruction::CreateFromProto( TF_RET_CHECK(proto.called_computation_ids_size() == 1) << "Map instruction should have 1 called computation but sees " << proto.called_computation_ids_size(); - instruction = CreateMap(proto.shape(), all_operands(), computations(0)); + instruction = CreateMap(shape, all_operands(), computations(0)); break; case HloOpcode::kSlice: { TF_RET_CHECK(proto.operand_ids_size() == 1) @@ -242,8 +243,8 @@ StatusOr> HloInstruction::CreateFromProto( slice_limits.push_back(slice_dimensions.limit()); slice_strides.push_back(slice_dimensions.stride()); } - instruction = CreateSlice(proto.shape(), operands(0), slice_starts, - slice_limits, slice_strides); + instruction = CreateSlice(shape, operands(0), slice_starts, slice_limits, + slice_strides); break; } case HloOpcode::kConstant: { @@ -253,7 +254,7 @@ StatusOr> HloInstruction::CreateFromProto( Literal::CreateFromProto(proto.literal())); instruction = CreateConstant(std::move(literal)); } else { - instruction = absl::make_unique(proto.shape()); + instruction = absl::make_unique(shape); } break; } @@ -284,55 +285,54 @@ StatusOr> HloInstruction::CreateFromProto( tensorflow::gtl::FindPtrOrNull(computation_map, fusion_id); TF_RET_CHECK(fused_computation != nullptr) << "No fusion computation with id " << fusion_id; - instruction = CreateFusion(proto.shape(), fusion_kind, all_operands(), - fused_computation); + instruction = + CreateFusion(shape, fusion_kind, all_operands(), fused_computation); break; } case HloOpcode::kRng: - instruction = - CreateRng(proto.shape(), proto.distribution(), all_operands()); + instruction = CreateRng(shape, proto.distribution(), all_operands()); break; case HloOpcode::kParameter: - instruction = CreateParameter(proto.parameter_number(), proto.shape(), - proto.name()); + instruction = + CreateParameter(proto.parameter_number(), shape, proto.name()); break; case HloOpcode::kGetTupleElement: TF_RET_CHECK(proto.operand_ids_size() == 1) << "GetTupleElement instruction should have 1 operand but sees " << proto.operand_ids_size(); - instruction = CreateGetTupleElement(proto.shape(), operands(0), - proto.tuple_index()); + instruction = + CreateGetTupleElement(shape, operands(0), proto.tuple_index()); break; case HloOpcode::kReducePrecision: TF_RET_CHECK(proto.operand_ids_size() == 1) << "ReducePrecision instruction should have 1 operand but sees " << proto.operand_ids_size(); - instruction = - CreateReducePrecision(proto.shape(), operands(0), - proto.exponent_bits(), proto.mantissa_bits()); + instruction = CreateReducePrecision( + shape, operands(0), proto.exponent_bits(), proto.mantissa_bits()); break; case HloOpcode::kInfeed: { - TF_RET_CHECK(ShapeUtil::IsTuple(proto.shape()) && - (ShapeUtil::TupleElementCount(proto.shape()) == 2)) + TF_RET_CHECK(ShapeUtil::IsTuple(shape) && + (ShapeUtil::TupleElementCount(shape) == 2)) << "Infeed should have a tuple shape with 2 operands, but has: " - << proto.shape(); - const Shape& data_shape = - 
ShapeUtil::GetTupleElementShape(proto.shape(), 0); + << shape; + const Shape& data_shape = ShapeUtil::GetTupleElementShape(shape, 0); TF_RET_CHECK(proto.operand_ids_size() == 1) << "Infeed instruction should have 1 operand but sees " << proto.operand_ids_size(); instruction = CreateInfeed(data_shape, operands(0), proto.infeed_config()); } break; - case HloOpcode::kOutfeed: + case HloOpcode::kOutfeed: { TF_RET_CHECK(proto.operand_ids_size() == 2) << "Outfeed instruction should have 2 operands but sees " << proto.operand_ids_size(); + Shape outfeed_shape(proto.outfeed_shape()); TF_RETURN_IF_ERROR( - ShapeUtil::ValidateShapeWithOptionalLayout(proto.outfeed_shape())); - instruction = CreateOutfeed(proto.outfeed_shape(), operands(0), - operands(1), proto.outfeed_config()); + ShapeUtil::ValidateShapeWithOptionalLayout(outfeed_shape)); + instruction = CreateOutfeed(outfeed_shape, operands(0), operands(1), + proto.outfeed_config()); break; + } case HloOpcode::kCrossReplicaSum: { TF_RET_CHECK(proto.called_computation_ids_size() == 1) << "CrossReplicaSum should have 1 called computation but sees " @@ -342,7 +342,7 @@ StatusOr> HloInstruction::CreateFromProto( all_reduce_id = proto.all_reduce_id(); } instruction = CreateCrossReplicaSum( - proto.shape(), all_operands(), computations(0), + shape, all_operands(), computations(0), /*replica_groups=*/ std::vector(proto.replica_groups().begin(), proto.replica_groups().end()), @@ -352,7 +352,7 @@ StatusOr> HloInstruction::CreateFromProto( } case HloOpcode::kAllToAll: { instruction = CreateAllToAll( - proto.shape(), all_operands(), + shape, all_operands(), /*replica_groups=*/ std::vector(proto.replica_groups().begin(), proto.replica_groups().end())); @@ -368,8 +368,8 @@ StatusOr> HloInstruction::CreateFromProto( source_target_pairs[i].first = proto.source_target_pairs(i).source(); source_target_pairs[i].second = proto.source_target_pairs(i).target(); } - instruction = CreateCollectivePermute(proto.shape(), operands(0), - source_target_pairs); + instruction = + CreateCollectivePermute(shape, operands(0), source_target_pairs); break; } case HloOpcode::kConvolution: { @@ -382,7 +382,7 @@ StatusOr> HloInstruction::CreateFromProto( precision_config.mutable_operand_precision()->Resize( proto.operand_ids_size(), PrecisionConfig::DEFAULT); instruction = CreateConvolve( - proto.shape(), operands(0), operands(1), + shape, operands(0), operands(1), std::max(proto.feature_group_count(), 1), proto.window(), proto.convolution_dimension_numbers(), precision_config); break; @@ -394,7 +394,7 @@ StatusOr> HloInstruction::CreateFromProto( TF_RET_CHECK(proto.called_computation_ids_size() == 1) << "ReduceWindow should have 1 called computation but sees " << proto.called_computation_ids_size(); - instruction = CreateReduceWindow(proto.shape(), operands(0), operands(1), + instruction = CreateReduceWindow(shape, operands(0), operands(1), proto.window(), computations(0)); break; case HloOpcode::kSelectAndScatter: @@ -404,9 +404,9 @@ StatusOr> HloInstruction::CreateFromProto( TF_RET_CHECK(proto.called_computation_ids_size() == 2) << "SelectAndScatter should have 2 called computations but sees " << proto.called_computation_ids_size(); - instruction = CreateSelectAndScatter( - proto.shape(), operands(0), computations(0), proto.window(), - operands(1), operands(2), computations(1)); + instruction = CreateSelectAndScatter(shape, operands(0), computations(0), + proto.window(), operands(1), + operands(2), computations(1)); break; case HloOpcode::kCustomCall: if (proto.constrain_layout()) { 
@@ -414,16 +414,17 @@ StatusOr> HloInstruction::CreateFromProto( // vector of pointers essentially) so create a vector of shapes to pass // in. std::vector operand_shapes; - for (const Shape& shape : proto.operand_shapes_with_layout()) { - operand_shapes.push_back(shape); + for (const ShapeProto& shape_proto : + proto.operand_shapes_with_layout()) { + operand_shapes.emplace_back(shape_proto); } - instruction = CreateCustomCall( - proto.shape(), all_operands(), proto.custom_call_target(), - operand_shapes, proto.custom_call_opaque()); + instruction = + CreateCustomCall(shape, all_operands(), proto.custom_call_target(), + operand_shapes, proto.custom_call_opaque()); } else { - instruction = CreateCustomCall(proto.shape(), all_operands(), - proto.custom_call_target(), - proto.custom_call_opaque()); + instruction = + CreateCustomCall(shape, all_operands(), proto.custom_call_target(), + proto.custom_call_opaque()); } if (proto.has_window()) { static_cast(instruction.get()) @@ -443,8 +444,8 @@ StatusOr> HloInstruction::CreateFromProto( << "Pad instruction should have 2 operands but sees " << proto.operand_ids_size(); TF_RET_CHECK(proto.has_padding_config()); - instruction = CreatePad(proto.shape(), operands(0), operands(1), - proto.padding_config()); + instruction = + CreatePad(shape, operands(0), operands(1), proto.padding_config()); break; case HloOpcode::kDynamicSlice: { TF_RET_CHECK(proto.operand_ids_size() == 2) @@ -452,8 +453,8 @@ StatusOr> HloInstruction::CreateFromProto( << proto.operand_ids_size(); std::vector slice_sizes(proto.dynamic_slice_sizes_size()); absl::c_copy(proto.dynamic_slice_sizes(), slice_sizes.begin()); - instruction = CreateDynamicSlice(proto.shape(), operands(0), operands(1), - slice_sizes); + instruction = + CreateDynamicSlice(shape, operands(0), operands(1), slice_sizes); break; } case HloOpcode::kGather: { @@ -469,7 +470,7 @@ StatusOr> HloInstruction::CreateFromProto( for (int64 bound : proto.gather_slice_sizes()) { gather_slice_sizes.push_back(bound); } - instruction = CreateGather(proto.shape(), operands(0), operands(1), + instruction = CreateGather(shape, operands(0), operands(1), *gather_dimension_numbers, gather_slice_sizes); break; } @@ -485,16 +486,15 @@ StatusOr> HloInstruction::CreateFromProto( auto scatter_dimension_numbers = absl::make_unique( proto.scatter_dimension_numbers()); - instruction = - CreateScatter(proto.shape(), operands(0), operands(1), operands(2), - computations(0), *scatter_dimension_numbers); + instruction = CreateScatter(shape, operands(0), operands(1), operands(2), + computations(0), *scatter_dimension_numbers); break; } case HloOpcode::kIota: TF_RET_CHECK(proto.dimensions_size() == 1) << "Iota instruction should have 1 dimension but sees " << proto.dimensions_size(); - instruction = CreateIota(proto.shape(), proto.dimensions(0)); + instruction = CreateIota(shape, proto.dimensions(0)); break; case HloOpcode::kDot: { TF_RET_CHECK(proto.has_dot_dimension_numbers()) @@ -506,8 +506,8 @@ StatusOr> HloInstruction::CreateFromProto( precision_config.mutable_operand_precision()->Resize( proto.operand_ids_size(), PrecisionConfig::DEFAULT); instruction = absl::make_unique( - proto.shape(), operands(0), operands(1), - proto.dot_dimension_numbers(), precision_config); + shape, operands(0), operands(1), proto.dot_dimension_numbers(), + precision_config); break; } case HloOpcode::kDomain: { @@ -529,7 +529,7 @@ StatusOr> HloInstruction::CreateFromProto( exit_hlo_sharding = std::make_shared(sharding); } instruction = absl::make_unique( - proto.shape(), 
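In the constrained-layout custom-call case above, the operand shapes are now rebuilt from their serialized ShapeProto form. A hedged restatement with the stripped template arguments filled in (treat the exact container types as an assumption):

```
// emplace_back relies on the Shape(const ShapeProto&) conversion constructor.
std::vector<Shape> operand_shapes;
for (const ShapeProto& shape_proto : proto.operand_shapes_with_layout()) {
  operand_shapes.emplace_back(shape_proto);
}
instruction =
    CreateCustomCall(shape, all_operands(), proto.custom_call_target(),
                     operand_shapes, proto.custom_call_opaque());
```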
operands(0), + shape, operands(0), absl::make_unique(entry_hlo_sharding), absl::make_unique(exit_hlo_sharding)); break; @@ -537,11 +537,11 @@ StatusOr> HloInstruction::CreateFromProto( case HloOpcode::kGetDimensionSize: TF_RET_CHECK(proto.operand_ids_size() == 1); TF_RET_CHECK(proto.dimensions_size() == 1); - instruction = CreateGetDimensionSize(proto.shape(), operands(0), - proto.dimensions(0)); + instruction = + CreateGetDimensionSize(shape, operands(0), proto.dimensions(0)); break; default: { - instruction = absl::WrapUnique(new HloInstruction(opcode, proto.shape())); + instruction = absl::WrapUnique(new HloInstruction(opcode, shape)); for (const int64 operand_id : proto.operand_ids()) { instruction->AppendOperand(instruction_map.at(operand_id)); } @@ -855,6 +855,16 @@ HloInstruction::CreateCollectivePermute( new HloInstruction(HloOpcode::kAfterAll, ShapeUtil::MakeTokenShape())); } +/* static */ std::unique_ptr +HloInstruction::CreateAddDependency(HloInstruction* data_operand, + HloInstruction* token_operand) { + auto instruction = absl::WrapUnique( + new HloInstruction(HloOpcode::kAddDependency, data_operand->shape())); + instruction->AppendOperand(data_operand); + instruction->AppendOperand(token_operand); + return instruction; +} + /* static */ std::unique_ptr HloInstruction::CreateWhile( const Shape& shape, HloComputation* condition, HloComputation* body, HloInstruction* init) { @@ -1394,6 +1404,10 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( clone = CreateAfterAll(new_operands); } break; + case HloOpcode::kAddDependency: + CHECK_EQ(new_operands.size(), 2); + clone = CreateAddDependency(new_operands[0], new_operands[1]); + break; } // SetupDerivedInstruction will setup the precision_config_ field. SetupDerivedInstruction(clone.get()); @@ -1680,6 +1694,7 @@ bool HloInstruction::IdenticalSlowPath( // This opcode has complex or special behavior so just return false. case HloOpcode::kAfterAll: + case HloOpcode::kAddDependency: return false; // Remaining instructions with special values. 
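CreateAddDependency above backs the new `add-dependency` opcode: it forwards the data operand's value while also consuming a token, so token-carried ordering constraints attach to the data value. A hedged builder sketch that mirrors the AddDependency parser test added later in this change (the builder and variable names are illustrative):

```
// Force 'exp' to be ordered after 'neg' even though it only consumes 'p'.
HloComputation::Builder builder("add_dependency_example");
Shape r0f32 = ShapeUtil::MakeShape(F32, {});
HloInstruction* p = builder.AddInstruction(
    HloInstruction::CreateParameter(0, r0f32, "p"));
HloInstruction* neg = builder.AddInstruction(
    HloInstruction::CreateUnary(r0f32, HloOpcode::kNegate, p));
HloInstruction* token = builder.AddInstruction(
    HloInstruction::CreateAfterAll({neg}));
HloInstruction* p_after_token = builder.AddInstruction(
    HloInstruction::CreateAddDependency(p, token));
builder.AddInstruction(
    HloInstruction::CreateUnary(r0f32, HloOpcode::kExp, p_after_token));
```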
@@ -1745,6 +1760,26 @@ bool HloInstruction::IdenticalSlowPath( return false; } +uint64 HloInstruction::Hash() const { + using tensorflow::Hash64Combine; + + uint64 hash_value = Hash64Combine(0, static_cast(opcode())); + hash_value = Hash64Combine(hash_value, ShapeUtil::Hash(shape())); + + if (!IsCrossModuleAllReduce()) { + if (!operands().empty()) { + for (size_t i = 0; i < operands().size(); ++i) { + hash_value = Hash64Combine(hash_value, operand(i)->Hash()); + } + } + } + + hash_value = Hash64Combine(hash_value, InnerHash()); + return hash_value; +} + +uint64 HloInstruction::InnerHash() const { return 13; } + void HloInstruction::RemoveUser(HloInstruction* user) { auto set_it = user_set_.find(user); CHECK(set_it != user_set_.end()); @@ -1900,6 +1935,11 @@ void HloInstruction::set_while_body(HloComputation* computation) { called_computations_[kBodyComputationIndex] = computation; } +HloInstruction* HloInstruction::while_init() const { + CHECK_EQ(HloOpcode::kWhile, opcode_); + return operands_[0]; +} + HloComputation* HloInstruction::true_computation() const { CHECK_EQ(HloOpcode::kConditional, opcode_); return called_computations_[kTrueComputationIndex]; @@ -2214,7 +2254,7 @@ HloInstructionProto HloInstruction::ToProto() const { proto.set_id(unique_id_); proto.set_name(name_); proto.set_opcode(HloOpcodeString(opcode_)); - *proto.mutable_shape() = shape_; + *proto.mutable_shape() = shape_.ToProto(); for (const HloInstruction* operand : operands_) { proto.add_operand_ids(operand->unique_id()); } @@ -2462,6 +2502,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleDomain(this); case HloOpcode::kAfterAll: return visitor->HandleAfterAll(this); + case HloOpcode::kAddDependency: + return visitor->HandleAddDependency(this); case HloOpcode::kIota: return visitor->HandleIota(this); case HloOpcode::kGetDimensionSize: @@ -2623,36 +2665,6 @@ Status HloInstruction::AcceptWithOperandOrder( return Status::OK(); } -namespace { - -// Returns true if the given order is a topological sort of the instructions -// it contains. -bool OrderIsTopologicalSort(const std::vector& order) { - // Create a map from instruction to its position in 'order'. - std::unordered_map order_position; - for (int i = 0; i < order.size(); i++) { - if (!order_position.insert({order[i], i}).second) { - // Instruction order[i] is duplicated in the order. - return false; - } - } - // Verify that the operand of each instruction in the order is also in the - // order *and* the operand's position is earlier (defs are before uses for - // all ops). 
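The new Hash()/InnerHash() pair is designed around the Identical() contract: instructions that compare Identical must hash equally, and cross-module all-reduces skip operand hashing just as Identical skips their operand comparison. A hedged sketch of the kind of hash-bucketed lookup this enables (the map and helper here are hypothetical, not part of this change):

```
#include "absl/container/flat_hash_map.h"

// Hash() only promises Identical(a, b) => a.Hash() == b.Hash(), so collisions
// must fall back to the precise Identical() check.
absl::flat_hash_map<uint64, std::vector<const HloInstruction*>> buckets;

const HloInstruction* FindEquivalent(const HloInstruction* instr) {
  auto it = buckets.find(instr->Hash());
  if (it == buckets.end()) return nullptr;
  for (const HloInstruction* candidate : it->second) {
    if (candidate->Identical(*instr)) return candidate;
  }
  return nullptr;
}
```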
- for (auto* instruction : order) { - for (auto* operand : instruction->operands()) { - if (!ContainsKey(order_position, operand) || - order_position.at(operand) >= order_position.at(instruction)) { - return false; - } - } - } - - return true; -} - -} // namespace - Status HloInstruction::Accept( const std::function& visitor_func) { FunctionVisitor visitor(visitor_func); @@ -3022,6 +3034,16 @@ const PrecisionConfig& HloInstruction::precision_config() const { LOG(FATAL) << "Unimplemented method."; } +PrecisionConfig* HloInstruction::mutable_precision_config() { + if (auto* convolution = DynCast(this)) { + return convolution->mutable_precision_config(); + } + if (auto* dot = DynCast(this)) { + return dot->mutable_precision_config(); + } + LOG(FATAL) << "Unimplemented method."; +} + HloModule* HloInstruction::GetModule() const { if (parent_) { return parent_->parent(); @@ -3064,6 +3086,10 @@ int64 HloInstruction::concatenate_dimension() const { return Cast(this)->concatenate_dimension(); } +int64 HloInstruction::dimension() const { + return Cast(this)->dimension(); +} + bool HloInstruction::IsRank2Transpose() const { auto transpose = DynCast(this); return transpose != nullptr && transpose->IsRank2Transpose(); @@ -3243,6 +3269,11 @@ absl::optional HloInstruction::all_reduce_id() const { return Cast(this)->all_reduce_id(); } +void HloInstruction::set_all_reduce_id( + const absl::optional& all_reduce_id) { + return Cast(this)->set_all_reduce_id(all_reduce_id); +} + const ConvolutionDimensionNumbers& HloInstruction::convolution_dimension_numbers() const { if (auto convolution = DynCast(this)) { diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 818d4ede0f3..a54716217d6 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -770,6 +770,9 @@ class HloInstruction { static std::unique_ptr CreateGetDimensionSize( const Shape& shape, HloInstruction* operand, int64 dimension); + static std::unique_ptr CreateAddDependency( + HloInstruction* data_operand, HloInstruction* token_operand); + // Returns the opcode for this instruction. HloOpcode opcode() const { return opcode_; } @@ -883,11 +886,15 @@ class HloInstruction { return false; } - // Use an explicit loop rather than ContainerEquals, because copying around - // std::functions may be too expensive in some cases. - for (size_t i = 0; i < operands().size(); ++i) { - if (!eq_operands(operand(i), other.operand(i))) { - return false; + // Two AllReduces are Identical if they have the same all_reduce_id. + // Their operands don't have to be Identical. + if (!IsCrossModuleAllReduce()) { + // Use an explicit loop rather than ContainerEquals, because copying + // around std::functions may be too expensive in some cases. + for (size_t i = 0; i < operands().size(); ++i) { + if (!eq_operands(operand(i), other.operand(i))) { + return false; + } } } @@ -898,6 +905,12 @@ class HloInstruction { return IdenticalSlowPath(other, eq_computations); } + // Generates a hash value of an HLO instruction. Hash considers + // information on opcode, shape, operands, and typically a root instruction. + // This function returns the same hash value for equivalent HLO instructions, + // with respect to HloInstruction::Identical() method. + uint64 Hash() const; + // Returns whether the instruction has a constant operand. 
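mutable_precision_config() above mirrors the existing const accessor: it is only meaningful on kConvolution and kDot instructions and hits LOG(FATAL) otherwise. A hedged example of raising a dot's operand precision through it ('dot' is an assumed HloInstruction* with opcode kDot):

```
// PrecisionConfig is a proto; its repeated operand_precision field gets the
// generated size/set accessors.
PrecisionConfig* config = dot->mutable_precision_config();
for (int i = 0; i < config->operand_precision_size(); ++i) {
  config->set_operand_precision(i, PrecisionConfig::HIGHEST);
}
```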
bool HasConstantOperand() const; @@ -997,6 +1010,8 @@ class HloInstruction { void set_while_condition(HloComputation* while_condition); void set_while_body(HloComputation* while_body); + HloInstruction* while_init() const; + // Gets/sets the true and false HloComputation for Conditional. The setters // should only be called by HloModule or HloComputation methods. // @@ -1257,6 +1272,7 @@ class HloInstruction { // superior. // Precondition: opcode must be kConvolution or kDot. const PrecisionConfig& precision_config() const; + PrecisionConfig* mutable_precision_config(); // Sets the debug metadata for this instruction. void set_metadata(const OpMetadata& metadata) { metadata_ = metadata; } @@ -1317,6 +1333,9 @@ class HloInstruction { // Delegates to HloConcatenateInstruction::concatenate_dimension. int64 concatenate_dimension() const; + // Delegates to HloGetDimensionSizeInstruction::dimension. + int64 dimension() const; + // Returns whether this instruction does a rank-2 transposition. bool IsRank2Transpose() const; @@ -1435,6 +1454,7 @@ class HloInstruction { // Delegates to HloAllReduceInstruction::all_reduce_id. absl::optional all_reduce_id() const; + void set_all_reduce_id(const absl::optional& all_reduce_id); // Returns data on the window in a windowed operation such as // convolution. @@ -1599,6 +1619,10 @@ class HloInstruction { const std::function& eq_computations) const; + // Generates a hash value specific to a particular type of an instruction. + // This function typically considers the inner root instruction. + virtual uint64 InnerHash() const; + // Creates an n-ary elementwise operation. static std::unique_ptr CreateNary( const Shape& shape, HloOpcode opcode, diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 4c765aa375c..1ea02cf9c03 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -370,6 +370,11 @@ HloAllReduceInstruction::HloAllReduceInstruction( AppendComputation(reduce_computation); } +void HloAllReduceInstruction::set_all_reduce_id( + const absl::optional& all_reduce_id) { + all_reduce_id_ = all_reduce_id; +} + HloInstructionProto HloAllReduceInstruction::ToProto() const { HloInstructionProto proto = HloCollectiveInstruction::ToProto(); // Proto3 is so sad. 
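while_init() declared above is a small convenience: for a kWhile instruction the loop's initial value is always operand 0, and the CHECK in the definition keeps the accessor from being used on other opcodes. A hedged usage sketch ('while_instr' is an assumed kWhile instruction):

```
// Read the loop's pieces symmetrically.
HloInstruction* init = while_instr->while_init();        // operand 0
HloComputation* cond = while_instr->while_condition();
HloComputation* body = while_instr->while_body();
CHECK(ShapeUtil::Compatible(init->shape(), while_instr->shape()));
```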
@@ -1367,6 +1372,10 @@ bool HloFusionInstruction::IdenticalSlowPath( other.fused_instructions_computation()); } +uint64 HloFusionInstruction::InnerHash() const { + return fused_instructions_computation()->Hash(); +} + std::unique_ptr HloFusionInstruction::CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, HloCloneContext* context) const { @@ -1610,7 +1619,7 @@ HloOutfeedInstruction::HloOutfeedInstruction(const Shape& outfeed_shape, HloInstructionProto HloOutfeedInstruction::ToProto() const { HloInstructionProto proto = HloInstruction::ToProto(); proto.set_outfeed_config(outfeed_config()); - *proto.mutable_outfeed_shape() = outfeed_shape(); + *proto.mutable_outfeed_shape() = outfeed_shape().ToProto(); return proto; } @@ -1862,7 +1871,7 @@ HloInstructionProto HloCustomCallInstruction::ToProto() const { if (layout_constrained()) { proto.set_constrain_layout(true); for (const Shape& shape : operand_shapes_with_layout_) { - *proto.add_operand_shapes_with_layout() = shape; + *proto.add_operand_shapes_with_layout() = shape.ToProto(); } } return proto; diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index d43a8973ccf..b5c28137a14 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -252,6 +252,7 @@ class HloAllReduceInstruction : public HloCollectiveInstruction { } absl::optional all_reduce_id() const { return all_reduce_id_; } + void set_all_reduce_id(const absl::optional& all_reduce_id); // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; @@ -742,6 +743,8 @@ class HloFusionInstruction : public HloInstruction { const HloInstruction& other, const std::function& eq_computations) const override; + uint64 InnerHash() const override; + // Implementation for non-common logic of CloneWithNewOperands. std::unique_ptr CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, @@ -954,6 +957,7 @@ class HloConvolutionInstruction : public HloInstruction { // information but it is presumed that the alternate lowering is strictly // superior. const PrecisionConfig& precision_config() const { return precision_config_; } + PrecisionConfig* mutable_precision_config() { return &precision_config_; } string ToCategory() const override; // Returns a serialized representation of this instruction. @@ -1325,6 +1329,7 @@ class HloDotInstruction : public HloInstruction { // information but it is presumed that the alternate lowering is strictly // superior. const PrecisionConfig& precision_config() const { return precision_config_; } + PrecisionConfig* mutable_precision_config() { return &precision_config_; } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h index 3e2f8bcd52f..d6a2b292a39 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.h +++ b/tensorflow/compiler/xla/service/hlo_lexer.h @@ -20,6 +20,7 @@ limitations under the License. 
#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/service/hlo_token.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc index 5269cad94d3..d28e79d41ad 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers.cc @@ -237,8 +237,4 @@ void PrintTo(const HloInstruction* inst, ::std::ostream* os) { *os << (inst ? inst->ToString() : "nullptr"); } -void PrintTo(HloInstruction* inst, ::std::ostream* os) { - PrintTo(const_cast(inst), os); -} - } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index 170ec93a334..235efb19ce4 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -385,7 +385,6 @@ std::vector Pointers(const Container& container) { // Tell GMock to print HloInstruction* by value, so error messages are nice. // Has to be in the same namespace as 'HloInstruction'. void PrintTo(const HloInstruction* inst, ::std::ostream* os); -void PrintTo(HloInstruction* inst, ::std::ostream* os); } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc index 234fcd266aa..d2740bcce26 100644 --- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc +++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc @@ -73,7 +73,7 @@ class ListScheduler { // Construct and return a memory-minimizing sequence of HLO instructions // containing the given HLO computation. static StatusOr Run( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -98,7 +98,7 @@ class ListScheduler { // comparison operators. using Priority = std::pair; - ListScheduler(const HloComputation& computation, + ListScheduler(HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -111,7 +111,7 @@ class ListScheduler { // instruction. An HLO instruction "uses" a LogicalBuffer if the // LogicalBuffer is in an operand of the instruction as indicated by // points-to analysis. - for (auto* instruction : computation.instructions()) { + for (auto* instruction : computation->instructions()) { absl::flat_hash_set instr_uses; for (auto* operand : instruction->operands()) { points_to_analysis.GetPointsToSet(operand).ForEachElement( @@ -126,13 +126,13 @@ class ListScheduler { // Create map containing the number of unscheduled uses (hlo instructions) // of each logical buffer. - for (auto* instruction : computation.instructions()) { + for (auto* instruction : computation->instructions()) { for (auto* buffer : points_to_analysis.GetBuffersDefinedByInstruction(instruction)) { unscheduled_use_count_[buffer] = 0; } } - for (auto* instruction : computation.instructions()) { + for (auto* instruction : computation->instructions()) { for (const LogicalBuffer* buffer : buffer_uses_.at(instruction)) { ++unscheduled_use_count_[buffer]; } @@ -141,7 +141,7 @@ class ListScheduler { // Buffers live out of the computation have an implicit use at the end of // the computation. 
for (const LogicalBuffer* live_out_buffer : - points_to_analysis.GetPointsToSet(computation.root_instruction()) + points_to_analysis.GetPointsToSet(computation->root_instruction()) .CreateFlattenedSet()) { ++unscheduled_use_count_[live_out_buffer]; } @@ -157,7 +157,7 @@ class ListScheduler { // HloInstruction, plus some cached metadata, saved for the purposes of making // BytesFreedIfScheduled fast. struct ReadyListEntry { - const HloInstruction* instruction; + HloInstruction* instruction; // The total size of all buffers defined by this instruction. int64 bytes_defined; @@ -171,7 +171,7 @@ class ListScheduler { }; // Creates a ReadyListEntry for the given instruction. - ReadyListEntry MakeReadyListEntry(const HloInstruction* instruction) { + ReadyListEntry MakeReadyListEntry(HloInstruction* instruction) { ReadyListEntry entry; entry.instruction = instruction; @@ -250,13 +250,13 @@ class ListScheduler { // Populate the ready list with instructions which have no operands or // control predecessors. absl::flat_hash_map unscheduled_pred_count; - for (auto* instruction : computation_.instructions()) { + for (auto* instruction : computation_->instructions()) { // TODO(b/34466113): Replace this and above with successors() or // predecessors() when these methods are added to HloInstruction. - for (const HloInstruction* user : instruction->users()) { + for (HloInstruction* user : instruction->users()) { unscheduled_pred_count[user]++; } - for (const HloInstruction* succ : instruction->control_successors()) { + for (HloInstruction* succ : instruction->control_successors()) { unscheduled_pred_count[succ]++; } } @@ -275,7 +275,7 @@ class ListScheduler { ready_instructions[inst] = it; }; - for (auto* instruction : computation_.instructions()) { + for (auto* instruction : computation_->instructions()) { if (instruction->operands().empty() && instruction->control_predecessors().empty()) { add_to_ready_queue(instruction); @@ -287,7 +287,7 @@ class ListScheduler { // schedule. auto best_it = ready_queue.end(); --best_it; - const HloInstruction* best = best_it->second.instruction; + HloInstruction* best = best_it->second.instruction; VLOG(2) << "Schedule instruction: " << best->ToShortString() << " Bytes freed: " << best_it->first.first; ready_queue.erase(best_it); @@ -348,13 +348,13 @@ class ListScheduler { } } } - CHECK_EQ(schedule.size(), computation_.instruction_count()); - CHECK_EQ(scheduled_instructions_.size(), computation_.instruction_count()); + CHECK_EQ(schedule.size(), computation_->instruction_count()); + CHECK_EQ(scheduled_instructions_.size(), computation_->instruction_count()); return schedule; } - const HloComputation& computation_; + HloComputation* computation_; const TuplePointsToAnalysis& points_to_analysis_; const LogicalBuffer::SizeFunction& size_function_; // Computations are analyzed in post-order. 
When scheduling an instruction @@ -386,13 +386,13 @@ int64 SumLogicalBufferSizes( } StatusOr ScheduleComputationHelper( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const MemorySchedulerAlgorithm& algorithm, const absl::flat_hash_map& memory_by_computation) { - VLOG(2) << "Computation: " << computation.name(); + VLOG(2) << "Computation: " << computation->name(); if (algorithm) { return algorithm(computation, points_to_analysis, size_function, memory_by_computation); @@ -404,17 +404,17 @@ StatusOr ScheduleComputationHelper( } // namespace StatusOr DFSMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& memory_by_computation) { // These variables are a hack to prevent overflows. int64 cumulative_total_size = 0; - int64 total_hlos = computation.parent()->instruction_count(); + int64 total_hlos = computation->parent()->instruction_count(); absl::flat_hash_map extra_users; absl::flat_hash_map total_sizes; - for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) { + for (const HloInstruction* hlo : computation->MakeInstructionPostOrder()) { if (ListScheduler::IgnoreInstruction(*hlo)) { extra_users[hlo] = 0; total_sizes[hlo] = 0; @@ -448,8 +448,8 @@ StatusOr DFSMemoryScheduler( total_sizes[hlo] = std::min(total_sizes[hlo], cumulative_total_size); extra_users[hlo] = std::min(extra_users[hlo], total_hlos); } - CHECK_EQ(extra_users.size(), computation.instruction_count()); - CHECK_EQ(total_sizes.size(), computation.instruction_count()); + CHECK_EQ(extra_users.size(), computation->instruction_count()); + CHECK_EQ(total_sizes.size(), computation->instruction_count()); // Construct a total order based on DFS post-order, visiting operands in // decreasing cumulative extra user order, and next by cumulative size, with a @@ -459,7 +459,7 @@ StatusOr DFSMemoryScheduler( sequence.push_back(hlo); return Status::OK(); }); - TF_RETURN_IF_ERROR(computation.AcceptWithOperandOrder( + TF_RETURN_IF_ERROR(computation->AcceptWithOperandOrder( &visitor, [&extra_users, &total_sizes](const HloInstruction* a, const HloInstruction* b) { if (extra_users[a] != extra_users[b]) { @@ -470,12 +470,12 @@ StatusOr DFSMemoryScheduler( } return a->name() < b->name(); })); - CHECK_EQ(sequence.size(), computation.instruction_count()); + CHECK_EQ(sequence.size(), computation->instruction_count()); return sequence; } // namespace xla StatusOr ListMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -485,16 +485,16 @@ StatusOr ListMemoryScheduler( } StatusOr PostOrderMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& memory_by_computation) { - return HloInstructionSequence(computation.MakeInstructionPostOrder()); + return HloInstructionSequence(computation->MakeInstructionPostOrder()); } StatusOr DefaultMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -513,7 +513,7 @@ 
StatusOr DefaultMemoryScheduler( memory_by_computation)); TF_ASSIGN_OR_RETURN(const int64 list_memory, HeapSimulator::MinimumMemoryForComputation( - computation, list_sequence, points_to_analysis, + *computation, list_sequence, points_to_analysis, size_function, &memory_by_computation)); VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory); @@ -522,7 +522,7 @@ StatusOr DefaultMemoryScheduler( size_function, memory_by_computation)); TF_ASSIGN_OR_RETURN(const int64 dfs_memory, HeapSimulator::MinimumMemoryForComputation( - computation, dfs_sequence, points_to_analysis, + *computation, dfs_sequence, points_to_analysis, size_function, &memory_by_computation)); VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory); @@ -532,7 +532,7 @@ StatusOr DefaultMemoryScheduler( memory_by_computation)); TF_ASSIGN_OR_RETURN(const int64 post_order_memory, HeapSimulator::MinimumMemoryForComputation( - computation, post_order_sequence, points_to_analysis, + *computation, post_order_sequence, points_to_analysis, size_function, &memory_by_computation)); VLOG(2) << "Min-memory post order sequence: " << HumanReadableNumBytes(post_order_memory); @@ -555,17 +555,17 @@ StatusOr DefaultMemoryScheduler( } StatusOr ScheduleModule( - const HloModule& module, const LogicalBuffer::SizeFunction& size_function, + HloModule* module, const LogicalBuffer::SizeFunction& size_function, const MemorySchedulerAlgorithm& algorithm) { - HloSchedule schedule(&module); + HloSchedule schedule(module); TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, - TuplePointsToAnalysis::Run(&module)); + TuplePointsToAnalysis::Run(module)); absl::flat_hash_map memory_by_computation; - for (const auto* computation : module.MakeComputationPostOrder()) { + for (auto* computation : module->MakeComputationPostOrder()) { if (!computation->IsFusionComputation()) { TF_ASSIGN_OR_RETURN(HloInstructionSequence computation_sequence, ScheduleComputationHelper( - *computation, *points_to_analysis, size_function, + computation, *points_to_analysis, size_function, algorithm, memory_by_computation)); memory_by_computation[computation] = HeapSimulator::MinimumMemoryForComputation( @@ -583,11 +583,11 @@ StatusOr ScheduleModule( } StatusOr ScheduleComputation( - const HloComputation& computation, + HloComputation* computation, const LogicalBuffer::SizeFunction& size_function) { - CHECK(!computation.IsFusionComputation()); + CHECK(!computation->IsFusionComputation()); TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, - TuplePointsToAnalysis::Run(computation.parent())); + TuplePointsToAnalysis::Run(computation->parent())); absl::flat_hash_map empty_map; return ScheduleComputationHelper(computation, *points_to_analysis, size_function, nullptr, empty_map); @@ -600,7 +600,7 @@ HloMemoryScheduler::HloMemoryScheduler( StatusOr HloMemoryScheduler::Run(HloModule* module) { TF_ASSIGN_OR_RETURN(HloSchedule schedule, - ScheduleModule(*module, size_function_, algorithm_)); + ScheduleModule(module, size_function_, algorithm_)); TF_RETURN_IF_ERROR(module->set_schedule(std::move(schedule))); return true; } diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h index cca5dc49398..7227bfb27c7 100644 --- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h +++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h @@ -36,14 +36,14 @@ namespace xla { // that describes buffer aliasing, together with a target-specific size function // that maps a tensor's 
logical size to its padded size. typedef std::function( - const HloComputation&, const TuplePointsToAnalysis&, + HloComputation*, const TuplePointsToAnalysis&, const LogicalBuffer::SizeFunction&, const absl::flat_hash_map&)> MemorySchedulerAlgorithm; // List scheduler StatusOr ListMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -51,7 +51,7 @@ StatusOr ListMemoryScheduler( // DFS-order scheduler StatusOr DFSMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -59,7 +59,7 @@ StatusOr DFSMemoryScheduler( // Naive Post Order scheduler StatusOr PostOrderMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -69,7 +69,7 @@ StatusOr PostOrderMemoryScheduler( // and the DFS scheduler, and chooses whichever returns a lower min-memory, // not accounting for fragmentation. StatusOr DefaultMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -79,13 +79,13 @@ StatusOr DefaultMemoryScheduler( // the computation. size_function is the function returning the number of bytes // required for a LogicalBuffer. StatusOr ScheduleModule( - const HloModule& module, const LogicalBuffer::SizeFunction& size_function, + HloModule* module, const LogicalBuffer::SizeFunction& size_function, const MemorySchedulerAlgorithm& algorithm = {}); // Computes the schedule for a single computation. // Currently only used by the GPU backend. StatusOr ScheduleComputation( - const HloComputation& computation, + HloComputation* computation, const LogicalBuffer::SizeFunction& size_function); // A pass which schedules the HLO instructions in a module. The HloModule's diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc index 984a6266abb..bc0d7e2bc00 100644 --- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc +++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc @@ -65,7 +65,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) { auto sub = builder.AddInstruction( HloInstruction::CreateBinary(vec, HloOpcode::kSubtract, add, negate)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); HloMemoryScheduler scheduler([](const BufferValue& buffer) { @@ -78,7 +78,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) { TF_ASSERT_OK(module->schedule().Verify()); // Verify that all instructions are in the sequence. - const std::vector& sequence = + const std::vector& sequence = module->schedule().sequence(module->entry_computation()).instructions(); EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.size()); @@ -124,9 +124,9 @@ ENTRY root { }; TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, size_fn, ListMemoryScheduler)); + ScheduleModule(module.get(), size_fn, ListMemoryScheduler)); // Verify that all instructions are in the sequence. 
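With the scheduler entry points switched from const references to pointers, callers now pass the module or computation pointer directly, as the updated tests below do. A condensed sketch of the new calling convention (the size function is the usual byte-size lambda; the pointer size is an assumed value):

```
auto size_fn = [](const BufferValue& buffer) {
  return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
};
TF_ASSERT_OK_AND_ASSIGN(
    HloSchedule schedule,
    ScheduleModule(module.get(), size_fn, ListMemoryScheduler));
TF_ASSERT_OK(schedule.Verify());
```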
- const std::vector& sequence = + const std::vector& sequence = schedule.sequence(module->entry_computation()).instructions(); EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.size()); @@ -172,15 +172,16 @@ TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) { builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, tuple_elm, abs_abs2)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule, - ScheduleModule(*module, - [](const BufferValue& buffer) { - return ShapeUtil::ByteSizeOf( - buffer.shape(), TUPLE_SIZE); - }, - ListMemoryScheduler)); + ScheduleModule( + module.get(), + [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), + TUPLE_SIZE); + }, + ListMemoryScheduler)); // Verify that all instructions are in the sequence. EXPECT_EQ(module->entry_computation()->instruction_count(), @@ -218,19 +219,19 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) { builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, tuple_elm, exp)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto* computation = module->AddEntryComputation(builder.Build()); auto fusion = computation->CreateFusionInstruction( {tuple, mul, add}, HloInstruction::FusionKind::kLoop); TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule, - ScheduleModule(*module, - [](const BufferValue& buffer) { - return ShapeUtil::ByteSizeOf( - buffer.shape(), 2); - }, - ListMemoryScheduler)); + ScheduleModule( + module.get(), + [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), 2); + }, + ListMemoryScheduler)); // Verify that all instructions are in the sequence. EXPECT_EQ(module->entry_computation()->instruction_count(), @@ -252,7 +253,7 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) { HloInstruction::CreateParameter(0, r1f32, "cond_param")); HloInstruction* zero_vector = cond_builder.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR2({{0, 0, 0, 0}}))); + LiteralUtil::CreateR1({0, 0, 0, 0}))); cond_builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector)); auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build()); @@ -284,7 +285,7 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) { }; TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, size_fn, ListMemoryScheduler)); + ScheduleModule(module.get(), size_fn, ListMemoryScheduler)); // Verify that all instructions are in the sequence. 
auto entry_computation = module->entry_computation(); EXPECT_EQ(module->entry_computation()->instruction_count(), diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index 14bf17f4be1..fe8371384c0 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -240,8 +240,10 @@ HloModuleProto HloModule::ToProto() const { *proto.mutable_schedule() = schedule().ToProto().ValueOrDie(); } *proto.mutable_host_program_shape() = - entry_computation_layout().ComputeProgramShape(); + entry_computation_layout().ComputeProgramShape().ToProto(); *proto.mutable_input_output_alias() = input_output_alias_config().ToProto(); + *proto.mutable_dynamic_parameter_binding() = + dynamic_parameter_binding().ToProto(); return proto; } @@ -255,7 +257,7 @@ StatusOr> HloModule::CreateFromProto( // the entry parameters and root. TF_RET_CHECK(proto.has_host_program_shape()) << "No program shape found in the proto"; - const auto& expected_program_shape = proto.host_program_shape(); + ProgramShape expected_program_shape(proto.host_program_shape()); TF_RET_CHECK(expected_program_shape.parameters_size() == module_config.entry_computation_layout().parameter_count()); for (int i = 0; i < expected_program_shape.parameters_size(); ++i) { @@ -325,6 +327,10 @@ StatusOr> HloModule::CreateFromProto( // Because we didn't uniquify the names or the ids, double-check that the // instruction and computation names and ids are unique from the proto. + TF_ASSIGN_OR_RETURN(module->dynamic_parameter_binding_, + DynamicParameterBinding::CreateFromProto( + proto.dynamic_parameter_binding())); + absl::flat_hash_set computation_names; absl::flat_hash_set instruction_names; absl::flat_hash_set computation_ids; @@ -363,9 +369,9 @@ StatusOr HloModule::CreateModuleConfigFromProto( const HloModuleProto& module, const DebugOptions& debug_options) { TF_RET_CHECK(module.has_host_program_shape()) << "No program shape found in the proto"; - const auto& program_shape = module.host_program_shape(); + ProgramShape program_shape(module.host_program_shape()); - HloModuleConfig module_config(program_shape); + HloModuleConfig module_config(ProgramShape{program_shape}); module_config.set_debug_options(debug_options); // The module config is constructed with default layouts regardless of what is diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index 8a1f999e3ab..7b9cbf9a53a 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -28,6 +28,7 @@ limitations under the License. #include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/iterator_util.h" +#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_clone_context.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -103,11 +104,7 @@ class HloModule { HloCloneContext* context = nullptr); // Return a pointer to the entry computation of the module. 
- const HloComputation* entry_computation() const { - CHECK_NE(nullptr, entry_computation_); - return entry_computation_; - } - HloComputation* entry_computation() { + HloComputation* entry_computation() const { CHECK_NE(nullptr, entry_computation_); return entry_computation_; } @@ -135,6 +132,12 @@ class HloModule { return config_.entry_computation_layout(); } + // Generates a hash value of an HLO module. Hash considers + // information on opcode, shape, operands, and typically a root instruction. + // This function returns the same hash value for equivalent HLO modules, + // with respect to HloInstruction::Identical() method. + uint64 Hash() const { return entry_computation()->Hash(); } + // Gets the computations in this module. // // Returns a view of HloComputation*s, so you can iterate over this in the @@ -232,6 +235,16 @@ class HloModule { return input_output_alias_config_; } + // DynamicParameterBinding holds the list of bindings that indicates which + // parameter dimensions are dynamic and which parameters represent their + // runtime value. + DynamicParameterBinding& dynamic_parameter_binding() { + return dynamic_parameter_binding_; + } + const DynamicParameterBinding& dynamic_parameter_binding() const { + return dynamic_parameter_binding_; + } + // Returns an id that is unique to this module across all modules created over // the lifetime of this process. int unique_id() const { return unique_id_; } @@ -285,6 +298,9 @@ class HloModule { // alias_config indicates the alias information of input/output buffers that // are expected from the module. HloInputOutputAliasConfig input_output_alias_config_; + + // Bindings for dynamic parameter mapping. + DynamicParameterBinding dynamic_parameter_binding_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc index 3ae67e4e5ee..620cb7e01ad 100644 --- a/tensorflow/compiler/xla/service/hlo_module_test.cc +++ b/tensorflow/compiler/xla/service/hlo_module_test.cc @@ -63,7 +63,7 @@ class HloModuleTest : public HloTestBase { TEST_F(HloModuleTest, OneComputationPostOrder) { // Create a module with a single computation. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(CreateConstantComputation()); EXPECT_THAT(module->MakeComputationPostOrder(), @@ -72,7 +72,7 @@ TEST_F(HloModuleTest, OneComputationPostOrder) { TEST_F(HloModuleTest, TwoComputationsPostOrder) { // Create a module with two unconnected computations. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation1 = module->AddEntryComputation(CreateConstantComputation()); auto computation2 = module->AddEmbeddedComputation(CreateConstantComputation()); @@ -88,7 +88,7 @@ TEST_F(HloModuleTest, TwoComputationsPostOrder) { TEST_F(HloModuleTest, CloneTest) { // Create and copy a module with a diamond call graph of computations. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation1 = module->AddEmbeddedComputation(CreateConstantComputation()); auto computation2 = @@ -111,7 +111,7 @@ TEST_F(HloModuleTest, CloneTest) { } TEST_F(HloModuleTest, CloneHasFusion) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); // Create the fused computation. 
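HloModule now carries a DynamicParameterBinding that is serialized next to the schedule and program shape, and HloModule::Hash() delegates to the entry computation. A hedged sketch of the proto round-trip these additions support (GetDebugOptionsFromFlags is assumed to be the usual flag helper; ToProto, CreateModuleConfigFromProto, and CreateFromProto come from the surrounding hunks):

```
// Serialize and restore: dynamic parameter bindings survive the round-trip,
// and equivalent modules keep equal hashes.
HloModuleProto proto = module->ToProto();
TF_ASSIGN_OR_RETURN(HloModuleConfig config,
                    HloModule::CreateModuleConfigFromProto(
                        proto, GetDebugOptionsFromFlags()));
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> restored,
                    HloModule::CreateFromProto(proto, config));
CHECK_EQ(module->Hash(), restored->Hash());
```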
HloComputation* fused_computation; @@ -154,7 +154,7 @@ TEST_F(HloModuleTest, CloneHasFusion) { TEST_F(HloModuleTest, DiamondComputationsPostOrder) { // Create a module with a diamond call graph of computations. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation1 = module->AddEmbeddedComputation(CreateConstantComputation()); auto computation2 = @@ -174,7 +174,7 @@ TEST_F(HloModuleTest, DiamondComputationsPostOrder) { TEST_F(HloModuleTest, LargeConstantToString) { // Create a module with a single computation. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder("Constant"); std::vector values(16, 42.0); builder.AddInstruction( @@ -194,8 +194,8 @@ TEST_F(HloModuleTest, LargeConstantToString) { } TEST_F(HloModuleTest, UniqueModuleId) { - auto module_a = CreateNewUnverifiedModule(); - auto module_b = CreateNewUnverifiedModule(); + auto module_a = CreateNewVerifiedModule(); + auto module_b = CreateNewVerifiedModule(); EXPECT_NE(module_a->unique_id(), module_b->unique_id()); } diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 70c7d70b41c..127cfd165a5 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -47,6 +47,8 @@ namespace xla { #define HLO_OPCODE_LIST(V) \ V(kAbs, "abs") \ V(kAdd, "add") \ + V(kAddDependency, "add-dependency") \ + V(kAfterAll, "after-all", kHloOpcodeIsVariadic) \ V(kAllToAll, "all-to-all") \ V(kAtan2, "atan2") \ V(kBatchNormGrad, "batch-norm-grad") \ @@ -84,7 +86,6 @@ namespace xla { V(kGather, "gather") \ V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \ V(kGetDimensionSize, "get-dimension-size") \ - V(kAfterAll, "after-all", kHloOpcodeIsVariadic) \ V(kGetTupleElement, "get-tuple-element") \ V(kGt, "greater-than", kHloOpcodeIsComparison) \ V(kImag, "imag") \ diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc index f5f99bece18..ca6a154809b 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering.cc @@ -356,8 +356,7 @@ void SequentialHloOrdering::Initialize() { // Create a map from instruction to its order position. TF_DCHECK_OK(schedule_.Verify()); for (const auto& computation_sequence : schedule_.sequences()) { - const std::vector& order = - computation_sequence.second.instructions(); + const auto& order = computation_sequence.second.instructions(); for (int i = 0; i < order.size(); ++i) { InsertOrDie(&order_position_, order[i], i); } diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc index 2ab8aa57f6e..3ca77e60cd5 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc @@ -53,7 +53,7 @@ TEST_F(HloOrderingTest, InstructionsInDifferentComputations) { // %c = Constant(42.0f) // // This results in a diamond-shaped callgraph. 
- auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto builder_c = HloComputation::Builder("C"); @@ -126,7 +126,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) { // %constant = Constant(1.0) // return While(%constant, body, condition) // - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto body_builder = HloComputation::Builder("body"); @@ -176,7 +176,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) { TEST_F(HloOrderingTest, ParametersDefinedBeforeOthers) { // Entry parameter should always be defined before other instruction. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( @@ -209,7 +209,7 @@ TEST_F(HloOrderingTest, ValuesInWhileComputations) { // %while = While(%constant, body, condition) // %add = Add(%constant, %while) // - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto body_builder = HloComputation::Builder("body"); @@ -407,7 +407,7 @@ TEST_F(HloOrderingTest, // %dead = Constant(123.0) // // %root should interfere with %dead. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto builder = HloComputation::Builder(TestName()); @@ -455,7 +455,7 @@ TEST_F(HloOrderingTest, // ROOT %call = call({%c}), subcomputation // // %root should interfere with %dead. - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); auto subbuilder = HloComputation::Builder(TestName() + ".sub"); diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index 4390145c6bd..9b5bb5d0bd6 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -47,11 +47,11 @@ const double kF16max = 65504; // Creates and returns a schedule created using the order of the instructions in // the HloComputation::instructions() vectors in the module. 
-HloSchedule ScheduleFromInstructionOrder(const HloModule* module) { +HloSchedule ScheduleFromInstructionOrder(HloModule* module) { HloSchedule schedule(module); - for (const HloComputation* computation : module->computations()) { + for (HloComputation* computation : module->computations()) { if (!computation->IsFusionComputation()) { - for (const HloInstruction* instruction : computation->instructions()) { + for (HloInstruction* instruction : computation->instructions()) { schedule.GetOrCreateSequence(computation).push_back(instruction); } } @@ -850,6 +850,15 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, } break; } + case HloOpcode::kAddDependency: { + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateAddDependency(operands[0], operands[1])); + break; + } case HloOpcode::kSort: { optional> dimensions; attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index c59bdc0a0b3..ab71f011ac9 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -21,7 +21,8 @@ limitations under the License. #include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" -#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" @@ -29,7 +30,7 @@ limitations under the License. 
namespace xla { namespace { -namespace op = ::xla::testing::opcode_matchers; +namespace m = ::xla::match; using absl::string_view; struct TestData { @@ -195,7 +196,7 @@ ENTRY %add_constants () -> f32[] { R"(HloModule TupleConstant_module ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) { - ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} )) + ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { {1}, {2} }, {2, 42} )) } )" @@ -587,7 +588,7 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_ R"(HloModule BasicTraining_module ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) { - %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ {1, 2} }, { /*i1=1*/ {3, 4} } }, { /*i0=1*/ { /*i1=0*/ {5, 6} }, { /*i1=1*/ {7, 8} } } }) + %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ { 1, 2 } }, { /*i1=1*/ { 3, 4 } } }, { /*i0=1*/ { /*i1=0*/ { 5, 6 } }, { /*i1=1*/ { 7, 8 } } } }) %constant.1 = f32[2]{0} constant({2, 3}) %constant.2 = f32[2]{0} constant({1, 2}) ROOT %batch-norm-training = (f32[2,2,1,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-training(f32[2,2,1,2]{3,2,1,0} %constant, f32[2]{0} %constant.1, f32[2]{0} %constant.2), epsilon=0.001, feature_index=3 @@ -1241,7 +1242,38 @@ ENTRY Sort { } )" + }, +// AfterAll with multiple operands +{ +"AfterAllWithMultipleOperands", +R"(HloModule AfterAllWithMultipleOperands + +ENTRY AfterAllWithMultipleOperands { + p0 = f32[] parameter(0) + token0 = token[] after-all() + token1 = token[] after-all() + ROOT after-all = token[] after-all(p0, token0, token1) } + +)" +}, +// AddDependency +// A dependency chain is created from 'neg' to 'exp' using tokens. 
+{ +"AddDependency", +R"(HloModule AddDependency + +ENTRY AddDependency { + p = f32[] parameter(0) + neg = f32[] negate(p) + token = token[] after-all(neg) + p_after_token = f32[] add-dependency(p, token) + exp = f32[] exponential(p_after_token) + ROOT sum = f32[] add(neg, exp) +} + +)" +}, }); // clang-format on } @@ -1862,7 +1894,8 @@ ENTRY ReduceR3ToR2 { )"; TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(original)); ASSERT_NE(module->entry_computation(), nullptr); - EXPECT_THAT(module->entry_computation()->root_instruction(), op::Reduce()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + GmockMatch(m::Reduce())); } TEST_F(HloParserTest, ParseSharding) { @@ -1922,7 +1955,7 @@ TEST(HloParserSingleOpTest, SingleOp) { const HloComputation* computation = module->entry_computation(); ASSERT_NE(computation, nullptr); EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Parameter(0), op::Parameter(1))); + GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(1)))); } TEST(HloParserSingleOpTest, SingleOpNoShapeProducesError) { @@ -1950,7 +1983,7 @@ TEST(HloParserSingleOpTest, SingleOpNoNames) { const HloComputation* computation = module->entry_computation(); ASSERT_NE(computation, nullptr); EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Parameter(0), op::Parameter(1))); + GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(1)))); } TEST(HloParserSingleOpTest, CanonicalOp) { @@ -1959,7 +1992,7 @@ TEST(HloParserSingleOpTest, CanonicalOp) { const HloComputation* computation = module->entry_computation(); ASSERT_NE(computation, nullptr); EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Parameter(0), op::Parameter(1))); + GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(1)))); EXPECT_EQ( computation->root_instruction()->ToString(HloPrintOptions::Canonical()), text); @@ -2013,7 +2046,11 @@ TEST(HloParserSingleOpTest, SingleOpWithNested) { const HloComputation* computation = module->entry_computation(); ASSERT_NE(computation, nullptr); EXPECT_THAT(computation->root_instruction(), - op::Fusion(op::Parameter(0), op::Parameter(1))); + GmockMatch(m::Op() + .WithOpcode(HloOpcode::kFusion) + .WithNumOperands(2) + .WithOperand(0, m::Parameter(0)) + .WithOperand(1, m::Parameter(1)))); } TEST(HloParserSingleOpTest, SingleOpWithNested_DoesNotExist) { @@ -2057,7 +2094,7 @@ TEST(HloParserSingleOpTest, ConvolutionTrivialFeatureGroupCount) { const HloComputation* computation = module->entry_computation(); ASSERT_NE(computation, nullptr); EXPECT_THAT(computation->root_instruction(), - op::Convolution(op::Parameter(0), op::Parameter(1))); + GmockMatch(m::Convolution(m::Parameter(0), m::Parameter(1)))); auto* convolution = Cast(computation->root_instruction()); EXPECT_EQ(convolution->feature_group_count(), 1); @@ -2121,8 +2158,10 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] { module->schedule().is_computation_scheduled(module->entry_computation())); EXPECT_THAT( module->schedule().sequence(module->entry_computation()).instructions(), - ::testing::ElementsAre(op::Parameter(), op::Broadcast(), op::Parameter(), - op::Multiply(), op::Parameter(), op::Add())); + ::testing::ElementsAre( + GmockMatch(m::Parameter()), GmockMatch(m::Broadcast()), + GmockMatch(m::Parameter()), GmockMatch(m::Multiply()), + GmockMatch(m::Parameter()), GmockMatch(m::Add()))); } TEST_F(HloParserTest, IsScheduledIsTrueDifferentOrder) { @@ -2148,8 +2187,10 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] { 
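The parser now accepts `add-dependency`, and the tests above switch from the opcode matchers to GmockMatch over the pattern matcher. A hedged sketch combining the two (ParseHloString, GmockMatch, and m::Op().WithOpcode are all taken from the surrounding hunks; the HLO text is illustrative):

```
const char* const kHloText = R"(
HloModule AddDependencyExample
ENTRY AddDependencyExample {
  p = f32[] parameter(0)
  token = token[] after-all(p)
  ROOT p_after_token = f32[] add-dependency(p, token)
}
)";
TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kHloText));
EXPECT_THAT(module->entry_computation()->root_instruction(),
            GmockMatch(m::Op().WithOpcode(HloOpcode::kAddDependency)));
```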
module->schedule().is_computation_scheduled(module->entry_computation())); EXPECT_THAT( module->schedule().sequence(module->entry_computation()).instructions(), - ::testing::ElementsAre(op::Parameter(), op::Parameter(), op::Parameter(), - op::Broadcast(), op::Multiply(), op::Add())); + ::testing::ElementsAre( + GmockMatch(m::Parameter()), GmockMatch(m::Parameter()), + GmockMatch(m::Parameter()), GmockMatch(m::Broadcast()), + GmockMatch(m::Multiply()), GmockMatch(m::Add()))); } TEST_F(HloParserTest, CustomCallWrongNumberofOperandConstraints) { diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.cc b/tensorflow/compiler/xla/service/hlo_proto_util.cc index cf33668f5bf..981d06ce101 100644 --- a/tensorflow/compiler/xla/service/hlo_proto_util.cc +++ b/tensorflow/compiler/xla/service/hlo_proto_util.cc @@ -48,7 +48,7 @@ StatusOr> CreateModuleFromProto( return std::move(module); } -StatusOr> EntryComputationParameterShapes( +StatusOr> EntryComputationParameterShapes( const HloProto& hlo_proto) { if (!hlo_proto.has_hlo_module()) { return NotFound("HloProto missing HloModuleProto."); @@ -57,15 +57,16 @@ StatusOr> EntryComputationParameterShapes( return NotFound("HloProto missing program shape."); } - std::vector parameter_shapes; + std::vector parameter_shapes; const auto& program_shape = hlo_proto.hlo_module().host_program_shape(); - for (const Shape& shape : program_shape.parameters()) { + for (const ShapeProto& shape : program_shape.parameters()) { parameter_shapes.push_back(&shape); } return parameter_shapes; } -StatusOr EntryComputationOutputShape(const HloProto& hlo_proto) { +StatusOr EntryComputationOutputShape( + const HloProto& hlo_proto) { if (!hlo_proto.has_hlo_module()) { return NotFound("HloProto missing HloModuleProto."); } diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.h b/tensorflow/compiler/xla/service/hlo_proto_util.h index 1db82dd6fca..31ea2aaffd9 100644 --- a/tensorflow/compiler/xla/service/hlo_proto_util.h +++ b/tensorflow/compiler/xla/service/hlo_proto_util.h @@ -43,12 +43,13 @@ StatusOr> CreateModuleFromProto( // Returns the shapes of the parameters of the entry computation. Shape pointers // refer to shapes inside of the given HloProto. -StatusOr> EntryComputationParameterShapes( +StatusOr> EntryComputationParameterShapes( const HloProto& hlo_proto); // Returns the shape of the output of the entry computation. The shape pointer // refers to the output shape inside of the given HloProto. -StatusOr EntryComputationOutputShape(const HloProto& hlo_proto); +StatusOr EntryComputationOutputShape( + const HloProto& hlo_proto); } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 49e46ecd00e..48add75523f 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -130,10 +130,10 @@ using ItemList = absl::InlinedVector; // before arbitrary elements. class InstructionList { public: - explicit InstructionList(const std::vector& order) { + explicit InstructionList(const HloInstructionSequence& order) { int64 position = 0; Item* last = nullptr; - for (const HloInstruction* inst : order) { + for (HloInstruction* inst : order.instructions()) { // Add a new item to the linked list. 
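The hlo_proto_util change above makes EntryComputationParameterShapes hand back ShapeProto pointers rather than Shape pointers. The sketch below is a hypothetical caller-side helper (its name and the need for it are assumptions) showing how Shape values could be rebuilt from those protos, using the Shape-from-proto construction that llvm_util.cc adopts later in this diff.

```cpp
// Hypothetical helper: rebuild xla::Shape values from the ShapeProto pointers
// now returned by EntryComputationParameterShapes.
#include <utility>
#include <vector>

#include "tensorflow/compiler/xla/service/hlo_proto_util.h"
#include "tensorflow/compiler/xla/shape.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"

namespace xla {

StatusOr<std::vector<Shape>> EntryParameterShapesAsShapes(
    const HloProto& hlo_proto) {
  TF_ASSIGN_OR_RETURN(std::vector<const ShapeProto*> shape_protos,
                      EntryComputationParameterShapes(hlo_proto));
  std::vector<Shape> shapes;
  shapes.reserve(shape_protos.size());
  for (const ShapeProto* proto : shape_protos) {
    Shape shape(*proto);                                  // Proto -> Shape.
    TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));  // Sanity check.
    shapes.push_back(std::move(shape));
  }
  return shapes;
}

}  // namespace xla
```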
Item* item = new Item; item->next = nullptr; @@ -151,7 +151,7 @@ class InstructionList { // to be monotonically increasing through the list, and so is still useful // for quickly(-ish) determining the order of arbitrary instructions in // the list. - item->instruction = const_cast(inst); + item->instruction = inst; item->position = position; position++; @@ -927,7 +927,7 @@ Item* PickRematerializationCandidate( StatusOr HloRematerialization::ComputePeakMemory( const HloComputation* computation, - const std::vector& order) const { + const HloInstructionSequence& order) const { InstructionList instruction_list(order); MemoryUsageTracker tracker(computation, size_function_, *points_to_analysis_, instruction_list); @@ -971,8 +971,7 @@ StatusOr HloRematerialization::RematerializeComputation( << HumanReadableNumBytes(computation_peak_memory_.at(computation)); CHECK(!ContainsKey(rematerialized_computations_, computation)); - InstructionList instruction_list( - schedule->sequence(computation).instructions()); + InstructionList instruction_list(schedule->sequence(computation)); MemoryUsageTracker memory_tracker(computation, size_function_, *points_to_analysis_, instruction_list); bool changed = false; @@ -1184,7 +1183,7 @@ StatusOr HloRematerialization::RematerializeComputation( sequence.clear(); for (auto* item = instruction_list.first(); item != nullptr; item = instruction_list.next(item)) { - const HloInstruction* instruction = item->instruction; + HloInstruction* instruction = item->instruction; sequence.push_back(instruction); } rematerialized_computations_.insert(computation); @@ -1235,10 +1234,8 @@ StatusOr HloRematerialization::Run(HloModule* module) { if (node.context() == CallContext::kSequential) { TF_ASSIGN_OR_RETURN( computation_peak_memory_[node.computation()], - ComputePeakMemory(node.computation(), - module->schedule() - .sequence(node.computation()) - .instructions())); + ComputePeakMemory(node.computation(), module->schedule().sequence( + node.computation()))); } return Status::OK(); }, diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h index 70d83c04f07..a07d348041b 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.h +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h @@ -87,9 +87,8 @@ class HloRematerialization : public HloModulePass { // peak memory is the maximum total size of all live HLO instruction values at // any program point. 'order' is the order in which the HLO instructions will // be emitted which is used to determine lifespans of HLO values. - StatusOr ComputePeakMemory( - const HloComputation* computation, - const std::vector& order) const; + StatusOr ComputePeakMemory(const HloComputation* computation, + const HloInstructionSequence& order) const; // Returns the peak memory usage of the called computations for the given // instruction. Zero is returned if the instruction calls no computations. diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 3f0ca342b4c..5a9b820a9d7 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -205,6 +205,40 @@ StatusOr HloRunner::ExecuteWithDeviceBuffers( /*profile=*/profile); } +StatusOr HloRunner::ExecuteWithDeviceBuffers( + std::unique_ptr executable, + const absl::Span arguments, + ExecutionProfile* profile) { + // Get service run options. 
+ se::Stream stream(backend().default_stream_executor()); + stream.Init(); + ServiceExecutableRunOptions service_run_options = + GetServiceRunOptionsForDevice(backend().default_device_ordinal(), &stream, + nullptr); + + TF_ASSIGN_OR_RETURN( + ScopedShapedBuffer retval, + executable->ExecuteOnStreamWrapper(&service_run_options, + /*profile=*/profile, arguments)); + TF_RETURN_IF_ERROR(stream.BlockHostUntilDone()); + return std::move(retval); +} + +StatusOr HloRunner::ExecuteWithDeviceBuffers( + std::unique_ptr executable, + const absl::Span arguments, + ExecutionProfile* profile) { + std::vector argument_pointers; + argument_pointers.reserve(arguments.size()); + for (const auto& argument : arguments) { + argument_pointers.push_back(&argument); + } + return ExecuteWithDeviceBuffers( + /*executable=*/std::move(executable), + /*arguments=*/argument_pointers, + /*profile=*/profile); +} + StatusOr> HloRunner::ExecuteReplicated( std::unique_ptr module, const ReplicatedExecuteOptions& options) { diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h index 2e934bf66ae..bb792cf8c98 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.h +++ b/tensorflow/compiler/xla/service/hlo_runner.h @@ -136,6 +136,21 @@ class HloRunner { const absl::Span arguments, bool run_hlo_passes = true, ExecutionProfile* profile = nullptr); + StatusOr ExecuteWithDeviceBuffers( + std::unique_ptr executable, + const absl::Span arguments, + ExecutionProfile* profile = nullptr); + + StatusOr ExecuteWithDeviceBuffers( + std::unique_ptr executable, + const absl::Span arguments, + ExecutionProfile* profile = nullptr); + + // Creates an executable object given an HLO module. If run_hlo_passes is + // true, the HLO passes will be run as part of compilation. + StatusOr> CreateExecutable( + std::unique_ptr module, bool run_hlo_passes); + // Executes a given HLO module into a set of replicas, and returns a map // with the replica number as key, and the corresponding returned literal as // value. @@ -152,11 +167,6 @@ class HloRunner { const Backend& backend() const; private: - // Creates an executable object given an HLO module. If run_hlo_passes is - // true, the HLO passes will be run before. - StatusOr> CreateExecutable( - std::unique_ptr module, bool run_hlo_passes); - // Creates a ServiceExecutableRunOptions object to configure a run on device, // using the provided stream object. If device_assignment is not nullptr, it // will be used to configure the replication parameters. 
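CreateExecutable moves into the public API above, alongside the new ExecuteWithDeviceBuffers overloads that accept a pre-built executable. Below is a hedged sketch of how the two might be combined; the wrapper name is illustrative and error handling is minimal.

```cpp
// Sketch only: compile an HLO module once via the newly public
// CreateExecutable, then run it on buffers that already live on the device
// using the executable-taking ExecuteWithDeviceBuffers overload.
#include <memory>
#include <utility>

#include "tensorflow/compiler/xla/service/hlo_runner.h"
#include "tensorflow/compiler/xla/status_macros.h"

namespace xla {

StatusOr<ScopedShapedBuffer> CompileAndRunOnDeviceBuffers(
    HloRunner* runner, std::unique_ptr<HloModule> module,
    absl::Span<const ShapedBuffer* const> arguments) {
  TF_ASSIGN_OR_RETURN(
      std::unique_ptr<Executable> executable,
      runner->CreateExecutable(std::move(module), /*run_hlo_passes=*/true));
  return runner->ExecuteWithDeviceBuffers(std::move(executable), arguments,
                                          /*profile=*/nullptr);
}

}  // namespace xla
```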
Replicated executions diff --git a/tensorflow/compiler/xla/service/hlo_schedule.cc b/tensorflow/compiler/xla/service/hlo_schedule.cc index a5780b7551a..8f6eb974c51 100644 --- a/tensorflow/compiler/xla/service/hlo_schedule.cc +++ b/tensorflow/compiler/xla/service/hlo_schedule.cc @@ -46,8 +46,8 @@ namespace xla { << "No computation exists in HLO module with id " << computation_id; const HloComputation* computation = comp_it->second; - absl::flat_hash_map id_to_instruction; - for (const HloInstruction* instruction : computation->instructions()) { + absl::flat_hash_map id_to_instruction; + for (HloInstruction* instruction : computation->instructions()) { id_to_instruction[instruction->unique_id()] = instruction; } @@ -81,9 +81,8 @@ StatusOr HloSchedule::ToProto() const { return std::move(proto); } -void HloSchedule::set_sequence( - const HloComputation* computation, - absl::Span sequence) { +void HloSchedule::set_sequence(const HloComputation* computation, + absl::Span sequence) { set_sequence(computation, HloInstructionSequence(sequence)); } @@ -114,8 +113,8 @@ Status HloSchedule::UpdateComputationSchedule( const HloComputation* computation) { // Map from unique ID to HloInstruction pointer for instructions in the // computation. - absl::flat_hash_map id_to_instruction; - for (const HloInstruction* instruction : computation->instructions()) { + absl::flat_hash_map id_to_instruction; + for (HloInstruction* instruction : computation->instructions()) { InsertOrDie(&id_to_instruction, instruction->unique_id(), instruction); } @@ -128,7 +127,7 @@ Status HloSchedule::UpdateComputationSchedule( // Map from HloInstruction X to newly added instructions (instruction is in // computation, but not in schedule) which use X. If an instruction is not in // the map, then it has no users which are newly added instructions. - absl::flat_hash_map> + absl::flat_hash_map> new_instruction_uses; // For each newly added instruction, this is the count of the instruction's @@ -138,9 +137,9 @@ Status HloSchedule::UpdateComputationSchedule( // Create a worklist of newly added instructions which are ready to be added // to the schedule. Initialize worklist with those that have zero operands. - std::queue worklist; + std::queue worklist; - for (const HloInstruction* instruction : computation->instructions()) { + for (HloInstruction* instruction : computation->instructions()) { if (ids_in_schedule.count(instruction->unique_id()) == 0) { // This is a newly added instruction which is not in the schedule. if (instruction->operands().empty()) { @@ -161,17 +160,17 @@ Status HloSchedule::UpdateComputationSchedule( // Lambda which schedules all instructions on the worklist. auto schedule_worklist = [&]() { while (!worklist.empty()) { - const HloInstruction* instruction = worklist.front(); + HloInstruction* instruction = worklist.front(); worklist.pop(); new_sequence.push_back(instruction); - std::vector* new_users = + std::vector* new_users = tensorflow::gtl::FindOrNull(new_instruction_uses, instruction); if (new_users != nullptr) { // This just-scheduled instruction has users which are newly added to // the module. Update the number of unscheduled operands and push the // newly added instruction to the worklist if it is ready to // schedule. 
- for (const HloInstruction* new_user : *new_users) { + for (HloInstruction* new_user : *new_users) { unscheduled_operand_count.at(new_user)--; CHECK_GE(unscheduled_operand_count.at(new_user), 0); if (unscheduled_operand_count.at(new_user) == 0) { diff --git a/tensorflow/compiler/xla/service/hlo_schedule.h b/tensorflow/compiler/xla/service/hlo_schedule.h index 0a714101ee5..486ddbf499d 100644 --- a/tensorflow/compiler/xla/service/hlo_schedule.h +++ b/tensorflow/compiler/xla/service/hlo_schedule.h @@ -35,14 +35,14 @@ class HloInstructionSequence { public: HloInstructionSequence() = default; explicit HloInstructionSequence( - absl::Span instructions) { - for (const HloInstruction* instruction : instructions) { + absl::Span instructions) { + for (HloInstruction* instruction : instructions) { push_back(instruction); } } // Adds the instruction to the end of the sequence. - void push_back(const HloInstruction* instruction) { + void push_back(HloInstruction* instruction) { instruction_sequence_.push_back(instruction); id_sequence_.push_back(instruction->unique_id()); } @@ -56,7 +56,7 @@ class HloInstructionSequence { int64 size() const { return instruction_sequence_.size(); } // Returns the sequence of HLO instructions. - const std::vector& instructions() const { + const std::vector& instructions() const { return instruction_sequence_; } @@ -65,7 +65,7 @@ class HloInstructionSequence { private: // The sequence as HloInstructions. - std::vector instruction_sequence_; + std::vector instruction_sequence_; // The sequence of HLO instructions, represented by their unique IDs. The // sequence is stored as both HloInstructions and unique IDs because the @@ -98,7 +98,7 @@ class HloSchedule { // Sets the sequence for the given computation to the given sequence. void set_sequence(const HloComputation* computation, - absl::Span sequence); + absl::Span sequence); void set_sequence(const HloComputation* computation, HloInstructionSequence sequence); diff --git a/tensorflow/compiler/xla/service/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/hlo_schedule_test.cc index 1424569ac1f..0e56e6f760e 100644 --- a/tensorflow/compiler/xla/service/hlo_schedule_test.cc +++ b/tensorflow/compiler/xla/service/hlo_schedule_test.cc @@ -56,10 +56,10 @@ ENTRY main { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); - const std::vector& entry_schedule = + const auto& entry_schedule = schedule.sequence(module->entry_computation()).instructions(); EXPECT_EQ(entry_schedule.size(), 6); @@ -90,7 +90,7 @@ ENTRY main { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); @@ -139,7 +139,7 @@ ENTRY main { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); @@ -183,7 +183,7 @@ ENTRY main { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); @@ -244,7 +244,7 @@ 
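The UpdateComputationSchedule logic above appends instructions that are new to the schedule once all of their blocking operands have been scheduled. The following is a self-contained, generic sketch of that ready-list idea over an illustrative Node type; it is not the actual XLA implementation.

```cpp
// Ready-list scheduling sketch: a new node becomes schedulable once all of its
// *new* operands are scheduled (operands already in the old schedule do not
// block it).
#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Node {
  std::vector<Node*> operands;  // Nodes that must be scheduled first.
  std::vector<Node*> users;     // Nodes that consume this node's result.
};

std::vector<Node*> ScheduleNewNodes(const std::vector<Node*>& new_nodes) {
  const std::unordered_set<Node*> is_new(new_nodes.begin(), new_nodes.end());
  std::unordered_map<Node*, int> unscheduled_new_operands;
  std::queue<Node*> worklist;
  for (Node* node : new_nodes) {
    const std::unordered_set<Node*> unique_operands(node->operands.begin(),
                                                    node->operands.end());
    int count = 0;
    for (Node* operand : unique_operands) {
      if (is_new.count(operand) > 0) ++count;  // Only new operands gate readiness.
    }
    unscheduled_new_operands[node] = count;
    if (count == 0) worklist.push(node);  // Ready to append immediately.
  }
  std::vector<Node*> appended_sequence;
  while (!worklist.empty()) {
    Node* node = worklist.front();
    worklist.pop();
    appended_sequence.push_back(node);
    for (Node* user : node->users) {
      auto it = unscheduled_new_operands.find(user);
      if (it != unscheduled_new_operands.end() && --it->second == 0) {
        worklist.push(user);  // Last blocking operand just got scheduled.
      }
    }
  }
  return appended_sequence;
}
```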
ENTRY %WhileLoop () -> s32[] { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/sizeof(void*)); })); @@ -313,7 +313,7 @@ ENTRY %WhileLoop () -> s32[] { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/sizeof(void*)); })); diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc index 88329c89979..f5061304456 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc @@ -253,7 +253,7 @@ StatusOr ApplyShardingFromUsers(HloInstruction* instruction, instruction->shape(), HloSharding::AssignDevice(kUnassignedDevice)); for (HloInstruction* user : instruction->users()) { if (user->opcode() == HloOpcode::kDomain && - domain.exit_domains.count(const_cast(user)) > 0) { + domain.exit_domains.count(user) > 0) { // If a user is a domain and it is registered in the domain exits, then // the instruction sharding is taken directly from the domain, and no // further users need to be visited. diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc index 11994d99c93..c1073911ea9 100644 --- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc +++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc @@ -66,7 +66,7 @@ class HloSubcomputationUnificationTest : public HloTestBase { }; TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto callee1 = @@ -103,7 +103,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) { } TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); auto callee1 = @@ -184,7 +184,7 @@ TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) { // Regression test for b/31466798. Checks that entry_computation is still valid // after unification. TEST_F(HloSubcomputationUnificationTest, TwoIdenticalComputations) { - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); for (int i = 0; i < 2; ++i) { HloComputation::Builder builder("pow"); auto x = diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h index b6670d409b9..1f01b0bb365 100644 --- a/tensorflow/compiler/xla/service/hlo_value.h +++ b/tensorflow/compiler/xla/service/hlo_value.h @@ -166,9 +166,6 @@ class HloValue : public BufferValue { // Whether this value is live out of the HLO module. bool live_out_of_module_ = false; - - // Whether this value is live out of its computation. 
- bool live_out_of_computation_ = false; }; std::ostream& operator<<(std::ostream& out, const HloValue& hlo_value); diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 27fd685a69a..77db7b098a3 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -753,13 +753,19 @@ Status ShapeVerifier::HandleAfterAll(HloInstruction* token) { for (const HloInstruction* operand : token->operands()) { operand_shapes.push_back(&operand->shape()); } - return CheckShape(token, ShapeInference::InferAfterAllShape(operand_shapes)); + return CheckShape(token, ShapeUtil::MakeTokenShape()); +} + +Status ShapeVerifier::HandleAddDependency(HloInstruction* add_dependency) { + TF_RETURN_IF_ERROR(CheckOperandCount(add_dependency, 2)); + TF_RETURN_IF_ERROR(CheckIsTokenOperand(add_dependency, 1)); + return CheckShape(add_dependency, add_dependency->operand(0)->shape()); } Status ShapeVerifier::HandleGetDimensionSize(HloInstruction* get_size) { - return CheckShape( - get_size, ShapeInference::InferGetDimensionSizeShape( - get_size->operand(0)->shape(), get_size->dimensions(0))); + return CheckShape(get_size, + ShapeInference::InferGetDimensionSizeShape( + get_size->operand(0)->shape(), get_size->dimension())); } Status ShapeVerifier::CheckShape(const HloInstruction* instruction, @@ -1373,9 +1379,8 @@ class InstructionVerifier : public DfsHloVisitorWithDefault { const Layout& operand_layout = operand_shape.layout(); TF_RET_CHECK(LayoutUtil::Equal(result_layout, operand_layout)) << "Instruction shouldn't change layouts " - << instruction->ToString() << " From " - << ShapeUtil::HumanString(result_shape) << " To " - << ShapeUtil::HumanString(operand_shape); + << instruction->ToString() << " From " << result_shape << " To " + << operand_shape; } } } @@ -1426,6 +1431,8 @@ StatusOr HloVerifier::Run(HloModule* module) { return target_metadata_->ShapeSize(shape); })); + TF_RETURN_IF_ERROR(module->dynamic_parameter_binding().Verify(*module)); + return false; } diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 9fbfd6a21c1..e4d0c3d6957 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -95,6 +95,7 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleScatter(HloInstruction* scatter) override; Status HandleAfterAll(HloInstruction* token) override; Status HandleGetDimensionSize(HloInstruction* get_size) override; + Status HandleAddDependency(HloInstruction* add_dependency) override; Status FinishVisit(HloInstruction*) override { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index 5ddfe0a944f..4bc557e4e62 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -35,6 +35,10 @@ namespace { using ::testing::HasSubstr; +std::unique_ptr CreateUnverifiedModule() { + return absl::make_unique("module", HloModuleConfig()); +} + // This class cannot be converted to use HloTestBase. It explicitly // uses HloTestBase to create and test malformed HLOs. 
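For clarity, the constraints that the new ShapeVerifier::HandleAddDependency above enforces can be restated as a free function: exactly two operands, the second a token, and a result shape identical to the data operand's shape. This is a restatement for illustration, not the verifier's actual code.

```cpp
// Restatement of the add-dependency shape rules checked above.
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"

namespace xla {

Status CheckAddDependencyShape(const HloInstruction* add_dependency) {
  TF_RET_CHECK(add_dependency->operand_count() == 2);
  TF_RET_CHECK(ShapeUtil::Equal(add_dependency->operand(1)->shape(),
                                ShapeUtil::MakeTokenShape()));
  TF_RET_CHECK(ShapeUtil::Equal(add_dependency->shape(),
                                add_dependency->operand(0)->shape()));
  return Status::OK();
}

}  // namespace xla
```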
class HloVerifierTest : public HloTestBase { @@ -66,7 +70,7 @@ TEST_F(HloVerifierTest, NullInstructionParent) { HloInstruction::CreateParameter(0, scalar_shape, "param")); HloInstruction* negate = builder.AddInstruction( HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateUnverifiedModule(); module->AddEntryComputation(builder.Build()); TF_ASSERT_OK(verifier().Run(module.get()).status()); @@ -85,7 +89,7 @@ TEST_F(HloVerifierTest, NullComputationParent) { HloInstruction::CreateParameter(0, scalar_shape, "param")); builder.AddInstruction( HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateUnverifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); TF_ASSERT_OK(verifier().Run(module.get()).status()); @@ -104,7 +108,7 @@ TEST_F(HloVerifierTest, DifferentOperandParents) { HloInstruction::CreateParameter(0, scalar_shape, "param")); HloInstruction* negate = builder.AddInstruction( HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateUnverifiedModule(); module->AddEntryComputation(builder.Build()); HloComputation::Builder emb_builder(TestName()); @@ -138,7 +142,7 @@ TEST_F(HloVerifierTest, ResetsShapeVerifierState) { builder.AddInstruction( HloInstruction::CreateBinary(s2, HloOpcode::kMultiply, add, add)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateUnverifiedModule(); module->AddEntryComputation(builder.Build()); // Run the verifier twice. It should fail both times, because it shouldn't @@ -303,7 +307,7 @@ TEST_F(HloVerifierTest, NegativeInteriorPaddingNotAllowed) { HloInstruction::CreateConstant(LiteralUtil::Zero(F32))), padding_config)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateUnverifiedModule(); module->AddEntryComputation(builder.Build()); auto status = verifier().Run(module.get()).status(); @@ -327,7 +331,7 @@ TEST_F(HloVerifierTest, PadNegativeInteriorDilationNotAllowed) { HloInstruction::CreateConstant(LiteralUtil::Zero(F32).Clone())), padding_config)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateUnverifiedModule(); module->AddEntryComputation(builder.Build()); EXPECT_THAT(verifier().Run(module.get()).status().error_message(), diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc index 20cc18f9815..98246d5403e 100644 --- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc +++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc @@ -481,8 +481,8 @@ ENTRY main { const char* expected_root_expression = R"( (scalar-indexed-const (constant s32[2,1,1,1,6] s32[2,1,1,1,6] { - { /*i0=0*/ { /*i1=0*/ { /*i2=0*/ {1, 2, 3, 4, 5, 6} } } }, - { /*i0=1*/ { /*i1=0*/ { /*i2=0*/ {1, 2, 3, 4, 5, 6} } } } }) + { /*i0=0*/ { /*i1=0*/ { /*i2=0*/ { 1, 2, 3, 4, 5, 6 } } } }, + { /*i0=1*/ { /*i1=0*/ { /*i2=0*/ { 1, 2, 3, 4, 5, 6 } } } } }) (reshape %indices to s32[]) 0->[]) )"; @@ -512,8 +512,8 @@ ENTRY main { const char* expected_root_expression = R"( (scalar-indexed-const (constant s32[2,1,1,6] s32[2,1,1,6] { - { /*i0=0*/ { /*i1=0*/ {1, 2, 3, 4, 5, 6} } }, - { /*i0=1*/ { /*i1=0*/ {1, 2, 3, 4, 5, 6} } } }) + { /*i0=0*/ { /*i1=0*/ { 1, 2, 3, 4, 5, 6 } } }, + { /*i0=1*/ { /*i1=0*/ { 1, 2, 3, 4, 5, 6 } } } }) (reshape %indices to s32[5]) 0->[2]) )"; 
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 7f2d7e7cffc..7559ed1bab8 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -103,7 +103,6 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kShiftRightLogical: case HloOpcode::kSlice: case HloOpcode::kSubtract: - case HloOpcode::kAfterAll: case HloOpcode::kTranspose: case HloOpcode::kTuple: case HloOpcode::kTupleSelect: @@ -116,7 +115,10 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kSin: return ShapeUtil::ElementIsComplex(instruction.shape()); - // Expensive instructions. + // Expensive instructions or unusual instructions for which fusion is + // nonsensical. + case HloOpcode::kAddDependency: + case HloOpcode::kAfterAll: case HloOpcode::kAtan2: case HloOpcode::kBatchNormGrad: case HloOpcode::kBatchNormInference: @@ -455,8 +457,13 @@ StatusOr InstructionFusion::Run(HloModule* module) { computation_ = computation; reachability_ = HloReachabilityMap::Build(computation_); - HloInstructionSet do_not_duplicate = - ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder()); + HloInstructionSet do_not_duplicate; + // If we allow duplications, we need to compute which instructions we do not + // want to duplicate based on a global analysis of the graph. + if (may_duplicate_) { + do_not_duplicate = + ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder()); + } auto fusion_queue = GetFusionQueue(computation_); // Instruction fusion effectively fuses edges in the computation graph @@ -564,8 +571,8 @@ HloInstruction* InstructionFusion::FuseIntoMultiOutput( bool InstructionFusion::MultiOutputFusionCreatesCycle( HloInstruction* producer, HloInstruction* consumer) { auto is_reachable = [&](const HloInstruction* a, const HloInstruction* b) { - // A consumer operand may have been multii-output fused into a parallel - // consumer and thus be missing from the oridinal reachability map. + // A consumer operand may have been multi-output fused into a parallel + // consumer and thus be missing from the original reachability map. 
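A hedged sketch of driving the pass the way the new tests below do: with may_duplicate=false, the pass now skips the global ComputeGloballyUnfusible analysis entirely, since no instruction may be duplicated anyway. The helper name is illustrative.

```cpp
// Run instruction fusion with duplication disabled.
#include "tensorflow/compiler/xla/service/instruction_fusion.h"

namespace xla {

StatusOr<bool> RunFusionWithoutDuplication(HloModule* module) {
  InstructionFusion fusion(InstructionFusion::IsExpensive,
                           /*may_duplicate=*/false);
  return fusion.Run(module);  // True iff any instructions were fused.
}

}  // namespace xla
```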
if (!reachability_->IsPresent(a) || !reachability_->IsPresent(b)) { reachability_ = HloReachabilityMap::Build(consumer->parent()); } diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc index 39904bd54b0..58b7135cea7 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc @@ -117,7 +117,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfParameterUnfused) { auto reshape1 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape1, computation->root_instruction()); EXPECT_FALSE( @@ -133,7 +133,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastSimpleReshapeOfParameterUnfused) { auto reshape1 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape1, computation->root_instruction()); EXPECT_FALSE( @@ -149,7 +149,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) { auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(S32, {}), param0, {})); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(transpose1, computation->root_instruction()); EXPECT_FALSE( @@ -394,6 +394,56 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) { .ValueOrDie()); } +TEST_F(InstructionFusionTest, FuseDiamondGraphsNoDuplication) { + auto module = ParseHloString(R"( + HloModule test_module + ENTRY Test { + p0 = f32[100] parameter(0) + p1 = f32[100] parameter(1) + add = f32[100] add(p0, p1) + slice1 = f32[99] slice(add), slice={[0:99:1]} + slice2 = f32[99] slice(add), slice={[1:100:1]} + ROOT add2 = f32[99] add(slice1, slice2) + })") + .ValueOrDie(); + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/false) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + // 'add' would originally need to be duplicated if fused. However after its + // two users 'slice1' and 'slice2' are fused into 'add2', 'add' has only one + // user and can now be also fused. + EXPECT_THAT(root, op::Fusion(op::Parameter(), op::Parameter())); +} + +TEST_F(InstructionFusionTest, FuseDiamondGraphsAllowDuplication) { + auto module = ParseHloString(R"( + HloModule test_module + ENTRY Test { + p0 = f32[100] parameter(0) + p1 = f32[100] parameter(1) + add = f32[100] add(p0, p1) + slice1 = f32[99] slice(add), slice={[0:99:1]} + slice2 = f32[99] slice(add), slice={[1:100:1]} + ROOT add2 = f32[99] add(slice1, slice2) + })") + .ValueOrDie(); + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + // 'add' would originally need to be duplicated if fused. However after its + // two users 'slice1' and 'slice2' are fused into 'add2', 'add' has only one + // user and can now be also fused. 
+ EXPECT_THAT(root, op::Fusion(op::Parameter(), op::Parameter())); +} + TEST_F(InstructionFusionTest, WideningConvertsAreAlwaysDuplicableIntoConsumers) { auto module = ParseHloString(R"( diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index a06d6113e84..7635fbfed6f 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -37,7 +37,7 @@ namespace xla { namespace interpreter { InterpreterExecutable::InterpreterExecutable( - std::unique_ptr hlo_module, + std::unique_ptr hlo_module, std::unique_ptr evaluator) : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr, /*hlo_profile_index_map=*/nullptr), diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index 3b1ebce0c75..bda13d37636 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -42,7 +42,7 @@ namespace interpreter { // buffer allocation. Refer to interpreter/README.md for more. class InterpreterExecutable : public Executable { public: - InterpreterExecutable(std::unique_ptr hlo_module, + InterpreterExecutable(std::unique_ptr hlo_module, std::unique_ptr evaluator); ~InterpreterExecutable() override; diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc index 4fb67bd0b72..e3e5fa71543 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.cc +++ b/tensorflow/compiler/xla/service/interpreter/executor.cc @@ -78,9 +78,14 @@ port::Status XlaInterpreterExecutor::SynchronousMemcpy( return port::Status::OK(); } -bool XlaInterpreterExecutor::HostCallback(Stream *stream, - std::function callback) { - AsExecutorStream(stream)->EnqueueTask(callback); +bool XlaInterpreterExecutor::HostCallback( + Stream *stream, std::function callback) { + AsExecutorStream(stream)->EnqueueTask([callback]() { + port::Status s = callback(); + if (!s.ok()) { + LOG(WARNING) << "Host callback failed: " << s; + } + }); return true; } diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h index fbb99457847..400c3051546 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.h +++ b/tensorflow/compiler/xla/service/interpreter/executor.h @@ -125,7 +125,8 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { return port::Status{port::error::UNIMPLEMENTED, ""}; } - bool HostCallback(Stream *stream, std::function callback) override; + bool HostCallback(Stream *stream, + std::function callback) override; port::Status AllocateEvent(Event *event) override { return port::Status{port::error::UNIMPLEMENTED, ""}; diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index a9041192220..eddef850cf5 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -2000,6 +2000,7 @@ bool LayoutAssignment::InstructionCanChangeLayout( switch (instruction->opcode()) { case HloOpcode::kAbs: case HloOpcode::kAdd: + case HloOpcode::kAddDependency: case HloOpcode::kAnd: case HloOpcode::kAtan2: case HloOpcode::kBitcastConvert: diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc 
index 2400b7bb7c4..311bd789054 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -31,6 +31,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" @@ -42,11 +44,10 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" -namespace op = xla::testing::opcode_matchers; - namespace xla { namespace { +namespace m = xla::match; using ::testing::ElementsAre; class LayoutAssignmentTest : public HloTestBase { @@ -328,11 +329,10 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { // %tuple.1 = Tuple(%copy) layout=({0,1}) // %tuple.2 = Tuple(%tuple.0, %tuple.1) layout=(({1,0}), ({0,1})) // - EXPECT_TRUE( - AlgebraicSimplifier(/*is_layout_sensitive=*/true, - [](const Shape&, const Shape&) { return false; }) - .Run(m.get()) - .ValueOrDie()); + AlgebraicSimplifierOptions options( + [](const Shape&, const Shape&) { return false; }); + options.set_is_layout_sensitive(true); + EXPECT_TRUE(AlgebraicSimplifier(options).Run(m.get()).ValueOrDie()); HloInstruction* root = m->entry_computation()->root_instruction(); // Verify layout of the root and the root's operands. EXPECT_TRUE(ShapeUtil::Equal(result_shape, root->shape())); @@ -343,7 +343,8 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { // Verify the structure of the HLO graph. 
EXPECT_THAT(root, - op::Tuple(op::Tuple(constant), op::Tuple(op::Copy(constant)))); + GmockMatch(m::Tuple(m::Tuple(m::Op().Is(constant)), + m::Tuple(m::Copy(m::Op().Is(constant)))))); } TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) { @@ -947,9 +948,11 @@ TEST_F(LayoutAssignmentTest, CopySliceOperandToAvoidImplicitLayoutChange) { HloInstruction* root = compiled_module->entry_computation()->root_instruction(); Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {4, 5}, {1, 0}); - EXPECT_THAT(root, op::Add(op::Parameter(), - op::Slice(AllOf(op::Copy(op::Parameter(1)), - op::ShapeWithLayout(shape_copy))))); + EXPECT_THAT( + root, + GmockMatch(m::Add( + m::Parameter(), + m::Slice(m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy))))); } TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) { @@ -977,10 +980,11 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) { compiled_module->entry_computation()->root_instruction(); Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {4, 5}, {1, 0}); EXPECT_THAT(root, - op::Add(op::Parameter(), - op::DynamicSlice(AllOf(op::Copy(op::Parameter(1)), - op::ShapeWithLayout(shape_copy)), - op::Parameter(2)))); + GmockMatch(m::Add( + m::Parameter(), + m::DynamicSlice( + m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy), + m::Parameter(2))))); } TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) { @@ -1008,11 +1012,12 @@ TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) { HloInstruction* root = compiled_module->entry_computation()->root_instruction(); Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {3, 5}, {1, 0}); - EXPECT_THAT(root, - op::Add(op::Parameter(), - op::Concatenate(AllOf(op::Copy(op::Parameter(1)), - op::ShapeWithLayout(shape_copy)), - op::Parameter(2)))); + EXPECT_THAT( + root, + GmockMatch(m::Add( + m::Parameter(), + m::Concatenate(m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy), + m::Parameter(2))))); } TEST_F(LayoutAssignmentTest, @@ -1039,7 +1044,8 @@ TEST_F(LayoutAssignmentTest, .ConsumeValueOrDie(); HloInstruction* root = compiled_module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Convolution(op::Parameter(0), op::Parameter(1))); + EXPECT_THAT(root, + GmockMatch(m::Convolution(m::Parameter(0), m::Parameter(1)))); } TEST_F(LayoutAssignmentTest, PropagatingLayoutFromResultToOperand) { @@ -1063,8 +1069,9 @@ TEST_F(LayoutAssignmentTest, PropagatingLayoutFromResultToOperand) { HloInstruction* root = compiled_module->entry_computation()->root_instruction(); Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {4, 5}, {0, 1}); - EXPECT_THAT(root, op::Slice(AllOf(op::Copy(op::Parameter(0)), - op::ShapeWithLayout(shape_copy)))); + EXPECT_THAT(root, + GmockMatch(m::Slice( + m::Copy(m::Parameter(0)).WithShapeEqualTo(&shape_copy)))); } TEST_F(LayoutAssignmentTest, TupleCopyOnLayoutMismatch) { @@ -1150,7 +1157,7 @@ ENTRY %CustomCallWithNotLayoutConstrained (p: f32[42,2,3]) -> f32[1,2,3,4] { AssignLayouts(m.get(), &computation_layout); HloInstruction* root = m->entry_computation()->root_instruction(); - ASSERT_THAT(root, op::CustomCall(op::Parameter())); + ASSERT_THAT(root, GmockMatch(m::CustomCall(m::Parameter()))); ExpectLayoutIs(root->shape(), {3, 2, 0, 1}); ExpectLayoutIs(root->operand(0)->shape(), {0, 2, 1}); } @@ -1166,7 +1173,7 @@ ENTRY %CustomCallWithNotLayoutConstrained (p: f32[42,2,3]) -> f32[1,2,3,4] { AssignLayouts(m.get(), &computation_layout); HloInstruction* root = 
m->entry_computation()->root_instruction(); - ASSERT_THAT(root, op::CustomCall(op::Parameter())); + ASSERT_THAT(root, GmockMatch(m::CustomCall(m::Parameter()))); ExpectLayoutIs(root->shape(), {0, 2, 3, 1}); ExpectLayoutIs(root->operand(0)->shape(), {0, 1, 2}); } @@ -1197,7 +1204,7 @@ ENTRY %CustomCallWithLayoutConstraints (p0: f32[4,4], p1: f32[2,3]) -> f32[1,2,3 // The custom call should be partially encapsulated in kCopy instructions // because of the layout mismatches. ASSERT_THAT(m->entry_computation()->root_instruction(), - op::Copy(op::CustomCall(op::Copy(), op::Parameter()))); + GmockMatch(m::Copy(m::CustomCall(m::Copy(), m::Parameter())))); const HloInstruction* custom_call = m->entry_computation()->root_instruction()->operand(0); @@ -1223,7 +1230,7 @@ ENTRY %CustomCallLayoutConstrainedZeroOperands () -> f32[1,2,3,4] { AssignLayouts(m.get(), &computation_layout); ASSERT_THAT(m->entry_computation()->root_instruction(), - op::Copy(op::CustomCall())); + GmockMatch(m::Copy(m::CustomCall()))); const HloInstruction* custom_call = m->entry_computation()->root_instruction()->operand(0); @@ -1257,7 +1264,7 @@ ENTRY %CustomCallLayoutConstrainedTupleOperand (p0: f32[4,4], p1: f32[2,3]) -> f ExpectLayoutIs(root->shape(), {2, 1, 0, 3}); ASSERT_THAT(m->entry_computation()->root_instruction(), - op::Copy(op::CustomCall(op::Tuple()))); + GmockMatch(m::Copy(m::CustomCall(m::Tuple())))); const HloInstruction* custom_call = m->entry_computation()->root_instruction()->operand(0); diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h index f4b05f29c38..d6d84994ee1 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h @@ -25,6 +25,7 @@ limitations under the License. #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/logging.h" @@ -108,6 +109,14 @@ class IrArray { Index(absl::Span multidim, llvm::Value* linear, const Shape& shape); + // Returns an index that adds `addend` to the given `dim` of the object. + Index AddOffsetToDim(llvm::Value* addend, int64 dim, + llvm::IRBuilder<>* b) const { + IrArray::Index index = *this; + index[dim] = b->CreateAdd(index[dim], addend); + return index; + } + const std::vector& multidim() const { return multidim_; } llvm::Value* linear() const { return linear_; } diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc index e5fbdbd51b8..1aa85eb8d2d 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc @@ -52,6 +52,29 @@ Shape MergeDimensions(absl::Span segs, const Shape& shape) { return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(), dimensions); } + +// Given an index for a shape, return the equivalent new index if the shape is +// reshaped to another shape. 
+IrArray::Index GetReshapedIndex(const IrArray::Index& index, const Shape& shape, + const Shape& reshaped_shape, + llvm::IRBuilder<>* b) { + auto bounds = shape.dimensions(); + auto minor_to_major = shape.layout().minor_to_major(); + llvm::Value* linear_index = index.GetConstantWithIndexType(0); + int64 multiplier = 1; + for (int i = 0; i < index.size(); ++i) { + int64 dim = minor_to_major[i]; + llvm::Value* addend = b->CreateMul( + index[dim], index.GetConstantWithIndexType(multiplier), "linearizing", + /*HasNUW=*/true, /*HasNSW=*/true); + linear_index = b->CreateAdd(linear_index, addend, "", + /*HasNUW=*/true, /*HasNSW=*/true); + multiplier *= bounds[dim]; + } + + return IrArray::Index(linear_index, reshaped_shape, b); +} + } // namespace absl::optional > FindTranspose021(const Shape& a, @@ -60,28 +83,30 @@ absl::optional > FindTranspose021(const Shape& a, return absl::nullopt; } - std::vector perm(a.dimensions().size()); - { - auto layout_a_orig = LayoutUtil::MinorToMajor(a); - std::vector layout_a(layout_a_orig.rbegin(), layout_a_orig.rend()); - auto layout_b_orig = LayoutUtil::MinorToMajor(b); - std::vector layout_b(layout_b_orig.rbegin(), layout_b_orig.rend()); - for (size_t i = 0; i < perm.size(); ++i) { - perm[i] = PositionInContainer(layout_b, layout_a[i]); - } + std::vector permutation(a.dimensions().size()); + absl::Span minor_to_major_a = LayoutUtil::MinorToMajor(a); + std::vector major_to_minor_a(minor_to_major_a.rbegin(), + minor_to_major_a.rend()); + absl::Span minor_to_major_b = LayoutUtil::MinorToMajor(b); + std::vector major_to_minor_b(minor_to_major_b.rbegin(), + minor_to_major_b.rend()); + for (size_t i = 0; i < permutation.size(); ++i) { + permutation[i] = PositionInContainer(major_to_minor_b, major_to_minor_a[i]); } - auto segs = ConsecutiveSegments(perm); - if ((3 == segs.size() && 0 == perm[0]) || 2 == segs.size()) { - Shape norm_a = + + std::vector segments = ConsecutiveSegments(permutation); + if ((3 == segments.size() && 0 == permutation[0]) || 2 == segments.size()) { + Shape descending_layout_shape = ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a); - Shape reduced_a = MergeDimensions(segs, norm_a); - auto reduced_a_dims = reduced_a.dimensions(); + Shape normalized_shape = MergeDimensions(segments, descending_layout_shape); + absl::Span normalized_dims = + AsInt64Slice(normalized_shape.dimensions()); std::vector dims_021; - if (2 == segs.size()) { + if (2 == segments.size()) { // The logical component-0 is of size one. 
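The GetReshapedIndex helper above linearizes the source index against the source shape and then delinearizes it against the reshaped shape. The block below is a host-side illustration of that linearization in plain integer arithmetic, not IR emission; names are illustrative.

```cpp
// Walk the dimensions from minor to major, scale each index component by the
// running stride, and accumulate. Delinearizing the result against the target
// shape then yields the reshaped index.
#include <cstdint>
#include <vector>

int64_t LinearizeIndex(const std::vector<int64_t>& index,
                       const std::vector<int64_t>& bounds,
                       const std::vector<int64_t>& minor_to_major) {
  int64_t linear = 0;
  int64_t multiplier = 1;                   // Stride of the current dimension.
  for (size_t i = 0; i < index.size(); ++i) {
    const int64_t dim = minor_to_major[i];  // i-th minor physical dimension.
    linear += index[dim] * multiplier;
    multiplier *= bounds[dim];              // Stride of the next, more major, dim.
  }
  return linear;
}
```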
- dims_021 = {1, reduced_a_dims[1], reduced_a_dims[0]}; + dims_021 = {1, normalized_dims[1], normalized_dims[0]}; } else { - dims_021 = {reduced_a_dims[0], reduced_a_dims[2], reduced_a_dims[1]}; + dims_021 = {normalized_dims[0], normalized_dims[2], normalized_dims[1]}; } return dims_021; @@ -90,27 +115,117 @@ absl::optional > FindTranspose021(const Shape& a, return absl::nullopt; } -IrArray::Index GetUnreducedOutputIndex( - const IrArray::Index& reduced_output_index, - const Shape& reduced_output_shape, const Shape& unreduced_output_shape, - llvm::IRBuilder<>* b) { - auto bounds = reduced_output_shape.dimensions(); - auto minor_to_major = reduced_output_shape.layout().minor_to_major(); - llvm::Value* linear_index = reduced_output_index.GetConstantWithIndexType(0); - int64 multiplier = 1; - for (int i = 0; i < reduced_output_index.size(); ++i) { - int64 dim = minor_to_major[i]; - llvm::Value* addend = - b->CreateMul(reduced_output_index[dim], - reduced_output_index.GetConstantWithIndexType(multiplier), - "linearizing", - /*HasNUW=*/true, /*HasNSW=*/true); - linear_index = b->CreateAdd(linear_index, addend, "", - /*HasNUW=*/true, /*HasNSW=*/true); - multiplier *= bounds[dim]; - } +KernelMappingScheme::KernelMappingScheme( + absl::Span dims_in_elems, int64 tile_size_y, int64 tile_size_x, + absl::Span req_block_sizes, int64 num_threads_y, + int64 num_threads_x, llvm::IRBuilder<>* b) + : b_(b), + dims_in_elems_(dims_in_elems.begin(), dims_in_elems.end()), + tile_sizes_{1, tile_size_y, tile_size_x}, + num_threads_x_(num_threads_x), + num_threads_y_(num_threads_y) { + DCHECK_EQ(dims_in_elems_.size(), 3); + DCHECK_EQ(req_block_sizes.size(), 3); - return IrArray::Index(linear_index, unreduced_output_shape, b); + DCHECK_EQ(tile_size_y % num_threads_y_, 0); + DCHECK_EQ(tile_size_x % num_threads_x_, 0); + + dims_in_tiles_ = ElementWiseCeilOfRatio(dims_in_elems_, tile_sizes_); + block_sizes_.reserve(req_block_sizes.size()); + absl::c_transform(req_block_sizes, dims_in_tiles_, + std::back_inserter(block_sizes_), + [](const int64 requested_size, const int64 max_size) { + return std::min(requested_size, max_size); + }); + dims_in_blocks_ = ElementWiseCeilOfRatio(dims_in_tiles_, block_sizes_); + + VLOG(10) << "dims_in_elems_ = [" << absl::StrJoin(dims_in_elems_, ",") << "]"; + VLOG(10) << "dims_in_tiles_ = [" << absl::StrJoin(dims_in_tiles_, ",") << "]"; + VLOG(10) << "dims_in_blocks_ = [" << absl::StrJoin(dims_in_blocks_, ",") + << "]"; +} + +IrArray::Index KernelMappingScheme::GetUnnormalizedIndex( + const IrArray::Index& normalized_shape_index, + const Shape& unnormalized_shape) { + DCHECK_EQ(normalized_shape_index.size(), dims_in_elems_.size()); + Shape output_shape = ShapeUtil::MakeShapeWithDescendingLayout( + unnormalized_shape.element_type(), GetDimensionsInElements()); + return GetReshapedIndex(normalized_shape_index, output_shape, + unnormalized_shape, b_); +} + +IrArray::Index KernelMappingScheme::EmitBlockIndex(llvm::Type* index_ty) { + llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_); + llvm_ir::AddRangeMetadata(0, GetNumberOfBlocks(), + llvm::cast(block_id)); + llvm::Value* linear_block_id = + b_->CreateIntCast(block_id, index_ty, /*isSigned=*/true, "block.id.x"); + return IrArray::Index(linear_block_id, + ShapeUtil::MakeShapeWithDescendingLayout( + PRED /*arbitrary*/, dims_in_blocks_), + b_); +} + +IrArray::Index KernelMappingScheme::GetTileIndexForBlockOrigin( + const IrArray::Index& block_index) { + IrArray::Index tile_index = 
block_index; + for (int i = 0; i < block_sizes_.size(); ++i) { + tile_index[i] = b_->CreateMul( + block_index[i], + llvm::ConstantInt::get(block_index[i]->getType(), block_sizes_[i]), + "block_origin." + std::to_string(i)); + } + return tile_index; +} + +IrArray::Index KernelMappingScheme::GetElementIndexForTileOrigin( + const IrArray::Index& tile_index) { + IrArray::Index elem_index = tile_index; + for (int i = DimY; i < DimTot; ++i) { + elem_index[i] = + b_->CreateMul(tile_index[i], + llvm::ConstantInt::get(tile_index[i]->getType(), + GetTileSizeForDimension(i)), + "tile_origin." + std::to_string(i)); + } + return elem_index; +} + +llvm::GlobalVariable* KernelMappingScheme::GetSharedMemoryBufferForElementType( + llvm::Type* elem_ty, absl::string_view buffer_name) { + // If shared memory transpose is needed, we use square tiles. + CHECK_EQ(GetTileSizeForDimensionX(), GetTileSizeForDimensionY()); + + // For Nvidia GPUs, the warp size is 32 threads and the shared memory is + // organized into 32 banks. We usually use the warp size or a multiple of + // the warp size as the tile size. This may cause all elements in the + // same column of a tile to use the same memory bank and therefore lead to + // shared memory bank conflicts. Adding 1 to the minor dimension of the shared + // memory buffer can reduce such shared memory bank conflicts. + llvm::Type* buffer_type = llvm::ArrayType::get( + llvm::ArrayType::get(elem_ty, GetTileSizeForDimension(DimX) + 1), + GetTileSizeForDimension(DimY)); + return llvm_ir::AllocateSharedMemoryTile(b_->GetInsertBlock()->getModule(), + buffer_type, buffer_name); +} + +std::tuple +KernelMappingScheme::EmitThreadYXCoordinate(llvm::Type* index_ty) { + // Calculate the (y, x) coordinate of the thread in the 2D view of the thread + // block defined by (num_thread_y, num_thread_x) from thread_id. + llvm::CallInst* thread_id_raw = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_); + llvm_ir::AddRangeMetadata(0, GetThreadsPerTile(), thread_id_raw); + llvm::Value* thread_id_int = + b_->CreateIntCast(thread_id_raw, index_ty, + /*isSigned=*/true, "thread.id.x"); + llvm::Value* num_thread_x = + llvm::ConstantInt::get(index_ty, GetNumberOfThreadsForDimensionX()); + llvm::Value* x = b_->CreateURem(thread_id_int, num_thread_x); + llvm::Value* y = b_->CreateUDiv(thread_id_int, num_thread_x); + return std::make_tuple(y, x); } } // namespace llvm_ir diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h index 5ea05b3188a..7277aeac8ad 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h @@ -28,23 +28,165 @@ namespace llvm_ir { // If a shape can be viewed as three logical components 0-1-2 in the order of // major to minor, a 0-2-1-transpose changes the order of such logical // components to 0-2-1. We call the shape being transposed the input shape and -// the transposed shape the output shape. The logical view of the input and -// output shapes for the transpose are called the 0-1-2 shape or reduced input -// shape and the 0-2-1 shape or the reduced output shape respectively. The -// original input and output shapes are called the unreduced input and output -// shapes. - +// the transposed shape the output shape. The logical view of the input/output +// shapes for the transpose are called the 0-1-2/0-2-1 shapes or the normalized +// shapes. The original input/output shapes are called unnormalized shapes.
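Two of the calculations above lend themselves to a short host-side illustration: the (y, x) thread coordinate that EmitThreadYXCoordinate derives from the linear thread id, and the +1 padding that GetSharedMemoryBufferForElementType applies to the minor dimension of the shared-memory tile. Sizes and names below are illustrative, not taken from any kernel.

```cpp
// Plain C++ illustration, not IR emission.
#include <cstdint>
#include <utility>

std::pair<int64_t, int64_t> ThreadYX(int64_t thread_id, int64_t num_threads_x) {
  // Row-major decomposition of the linear id over a (num_threads_y,
  // num_threads_x) thread block, matching the UDiv/URem pair emitted above.
  return {thread_id / num_threads_x, thread_id % num_threads_x};
}

// A 32x32 tile padded to 32x33 elements: successive rows start in different
// shared-memory banks, so a column-wise read no longer maps every thread to
// the same bank.
float padded_tile[32][32 + 1];
```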
+// // If `b` is a 0-2-1 transpose of `a` in 0-1-2, return the dimensions for the -// reduced shape of `b` or the 0-2-1 shape. +// normalized shape of `b` or the 0-2-1 shape. absl::optional > FindTranspose021(const Shape& a, const Shape& b); -// Return the unreduced output index corresponding to the given reduced output -// index. -IrArray::Index GetUnreducedOutputIndex( - const IrArray::Index& reduced_output_index, - const Shape& reduced_output_shape, const Shape& unreduced_output_shape, - llvm::IRBuilder<>* b); +// A tile is a spatial subdivision of a tensor. We group tensor elements into +// tiles so that we can launch kernels to process the tensor elements in blocks +// of tiles. +// +// A kernel mapping scheme describes a method to partition the tensors accessed +// by an unnested HLO instruction into tiles and blocks of tiles, and the +// associated information to use hardware threads to process the tensor elements +// in blocks of tiles. +// +// Currently, there are two main use cases for a tiling scheme. First, we +// implement kernels with 0-2-1 memory transpose using shared memory to improve +// memory access pattern. Second, we implement reduction to contiguous +// dimensions in layout, with or without memory transpose, to achieve better +// memory access pattern as well as to reduce the number of expensive +// instructions executed, such as thread synchronization related instructions +// and atomic operations. For both use cases, we can apply a normalization to +// the original tensors, to collapse contiguous dimensions for the same purpose +// and produce normalized three dimensional tensors. For this reason, the tiling +// scheme class only needs to handle normalized three dimensional tensors and +// two dimensional tiles. +// +// The current implementation of the class is somewhat NVIDIA GPU oriented; this +// can be improved when the need arises. The idea of 0-2-1 +// transpose using shared memory can be found in the following CUDA algorithm in +// TensorFlow: https://goo.gl/MStRV6. +// +// We use a thread block to process a tile because we want to use the HW thread +// block synchronization primitives to synchronize the processing of all the +// elements in the same tile. A thread block can be viewed as a two dimensional +// array of threads, described by the number of threads for the Y and X +// dimensions. A thread block (num_threads_y, num_threads_x) processes a tile of +// (tile_size_y, tile_size_x) as follows: each thread in the thread block +// processes one element in the tile so that all the threads in the thread block +// together process a subdivision of the tile that has the same dimension as the +// thread block array. Then the thread block moves on to process the next +// subdivision of the tile until the whole tile is processed. Therefore, each +// thread in the thread block processes +// tile_size_x/num_threads_x * tile_size_y/num_threads_y elements in a tile. +// +// There are situations where we want a thread block to process multiple +// tiles. We can't group those tiles into a bigger tile because we limit a tile +// to a two dimensional spatial subdivision of a tensor. For example, when we +// use tiling to implement reduction with transpose, we want the partial sum +// produced by each thread to accumulate values for more elements before using +// shfl_down and atomic_add instructions for further reduction, to amortize the +// cost of such expensive instructions. The concept of a tile block is introduced +// for this purpose.
A tile block is a three dimensional array of tiles, of +// which some dimensions may be degenerated to only one tile. +class KernelMappingScheme { + public: + enum { DimZ = 0, DimY, DimX, DimTot }; + + public: + KernelMappingScheme() {} + // dims_in_elems: the normalized tensor dimensions. + // req_block_sizes: the requested block size in number of tiles for each + // dimension. The actual block size is set to min(req_block_size, + // dims_in_number_of_blocks). + KernelMappingScheme(absl::Span dims_in_elems, int64 tile_size_y, + int64 tile_size_x, + absl::Span req_block_sizes, + int64 num_threads_y, int64 num_threads_x, + llvm::IRBuilder<>* b); + + absl::Span GetDimensionsInElements() const { + return dims_in_elems_; + } + absl::Span GetDimensionsInTiles() const { + return dims_in_tiles_; + } + absl::Span GetDimensionsInBlocks() const { + return dims_in_blocks_; + } + + int64 GetNumberOfTilesInTotal() const { + return absl::c_accumulate(dims_in_tiles_, 1LL, std::multiplies()); + } + int64 GetNumberOfTilesInOneBlock() const { + return absl::c_accumulate(block_sizes_, 1, std::multiplies()); + } + + int64 GetNumberOfBlocks() const { + return absl::c_accumulate(dims_in_blocks_, 1, std::multiplies()); + } + + int64 GetTileSizeForDimension(int d) const { + DCHECK(d >= DimZ && d <= DimX); + return tile_sizes_[d]; + } + int64 GetTileSizeForDimensionX() const { + return GetTileSizeForDimension(DimX); + } + int64 GetTileSizeForDimensionY() const { + return GetTileSizeForDimension(DimY); + } + + absl::Span GetBlockSizes() const { return block_sizes_; } + int64 GetTileBlockSizeForDimension(int d) const { + DCHECK(d >= DimZ && d <= DimX); + return dims_in_blocks_[d]; + } + + int64 GetNumberOfThreadsForDimensionX() const { return num_threads_x_; } + int64 GetNumberOfThreadsForDimensionY() const { return num_threads_y_; } + + int64 GetThreadsPerTile() const { + return GetNumberOfThreadsForDimensionX() * + GetNumberOfThreadsForDimensionY(); + } + + IrArray::Index EmitBlockIndex(llvm::Type* index_ty); + // Returns the index for the first tile in the block with the given block + // index. + IrArray::Index GetTileIndexForBlockOrigin(const IrArray::Index& block_index); + // Returns the index for the first element in the tile with the given tile + // index. + IrArray::Index GetElementIndexForTileOrigin(const IrArray::Index& tile_index); + + std::tuple EmitThreadYXCoordinate( + llvm::Type* index_ty); + + IrArray::Index GetUnnormalizedIndex( + const IrArray::Index& normalized_shape_index, + const Shape& unnormalized_shape); + + llvm::GlobalVariable* GetSharedMemoryBufferForElementType( + llvm::Type* elem_ty, absl::string_view buffer_name); + + private: + llvm::IRBuilder<>* b_; + // The number of elements in each dimension. + std::vector dims_in_elems_; + + // The number of elements for each dimension of a tile. + std::vector tile_sizes_; + // The number of tiles in each dimension. It is computed from dims_in_elem_ + // and tile_sizes_. + std::vector dims_in_tiles_; + + // The number of tiles for each dimension of a tile block. + std::vector block_sizes_; + // The number of blocks in each dimension of a tile block. It is computed from + // dims_in_tile_ and block_sizes_. + std::vector dims_in_blocks_; + + // Number of threads used to process elements in the X direction of a tile. + int64 num_threads_x_; + // Number of threads used to process elements in the Y direction of a tile. + int64 num_threads_y_; +}; // A class to represent information for tiled parameters to support IR emission // for 021 transpose. 
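To make the tile/block bookkeeping above concrete, here is a small standalone sketch (not part of the patch) of how the quantities tracked by KernelMappingScheme relate to each other. The CeilDiv helper, the example dimensions, and the main() harness are illustrative assumptions; the arithmetic simply mirrors the constructor comment (tiles cover elements, block sizes are clamped to the available tiles, blocks cover tiles), and the real class may compute these values differently.

```
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative only: mirrors the relationships described in the
// KernelMappingScheme constructor comment above.
int64_t CeilDiv(int64_t a, int64_t b) { return (a + b - 1) / b; }

int main() {
  // A normalized (Z, Y, X) tensor of 4 x 96 x 100 elements (hypothetical).
  std::vector<int64_t> dims_in_elems = {4, 96, 100};
  // One-element tiles along Z; square 32 x 32 tiles along Y/X, as used for
  // the shared-memory transpose.
  std::vector<int64_t> tile_sizes = {1, 32, 32};
  // Requested number of tiles per block in each dimension.
  std::vector<int64_t> req_block_sizes = {8, 1, 1};

  std::vector<int64_t> dims_in_tiles(3), block_sizes(3), dims_in_blocks(3);
  for (int i = 0; i < 3; ++i) {
    // Number of tiles needed to cover the dimension.
    dims_in_tiles[i] = CeilDiv(dims_in_elems[i], tile_sizes[i]);
    // Actual block size: the requested size, clamped to the available tiles.
    block_sizes[i] = std::min(req_block_sizes[i], dims_in_tiles[i]);
    // Number of tile blocks needed to cover the tiles.
    dims_in_blocks[i] = CeilDiv(dims_in_tiles[i], block_sizes[i]);
    std::cout << "dim " << i << ": " << dims_in_tiles[i] << " tiles, "
              << block_sizes[i] << " tiles/block, " << dims_in_blocks[i]
              << " blocks\n";
  }
  return 0;
}
```

The product of dims_in_blocks computed this way corresponds to what GetNumberOfBlocks() returns in the class declaration above.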
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index df78726166e..ceea24685af 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -244,10 +244,11 @@ StatusOr EncodeSelfDescribingShapeConstant(const Shape& shape, StatusOr DecodeSelfDescribingShapeConstant(const void* shape_ptr, int32 size_bytes) { - Shape shape; - TF_RET_CHECK(shape.ParseFromArray(shape_ptr, size_bytes)); + ShapeProto shape_proto; + TF_RET_CHECK(shape_proto.ParseFromArray(shape_ptr, size_bytes)); + Shape shape(shape_proto); TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape)); - return shape; + return std::move(shape); } llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal, diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc index fd16af67fe9..e22c2173c27 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc @@ -47,7 +47,8 @@ namespace { // Adds the inner comparison loop body where we compare elements. void EmitCompareLoopBody( int64 iteration_bound, PrimitiveType key_type, int64 num_values, - llvm::Value* element_pair_index, int64 xor_mask, llvm::Type* index_type, + int64 iota_values_parameter_index, llvm::Value* element_pair_index, + int64 xor_mask, llvm::Type* index_type, std::function read_element, std::function write_element, @@ -139,34 +140,42 @@ void EmitCompareLoopBody( is_signed_comparison = false; } // If key2 < key1 - ksl.IfReturnVoid( - "is_smaller_than", + auto is_smaller_than = b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT : llvm::ICmpInst::ICMP_ULT, - compare_key2, compare_key1), - [&]() { - // Swap key1 with key2. - write_element(0, current_keys_index, key2); - write_element(0, compare_keys_index, key1); - for (int64 i = 1; i <= num_values; ++i) { - // Also swap the values. - auto value1 = read_element(i, current_keys_index); - auto value2 = read_element(i, compare_keys_index); - write_element(i, current_keys_index, value2); - write_element(i, compare_keys_index, value1); - } - }); + compare_key2, compare_key1); + if (iota_values_parameter_index >= 0) { + auto keys_equal = b->CreateICmpEQ(compare_key1, compare_key2); + auto key_index1 = + read_element(iota_values_parameter_index, current_keys_index); + auto key_index2 = + read_element(iota_values_parameter_index, compare_keys_index); + auto index_is_smaller_than = + b->CreateICmp(llvm::ICmpInst::ICMP_ULT, key_index2, key_index1); + is_smaller_than = b->CreateOr( + is_smaller_than, b->CreateAnd(keys_equal, index_is_smaller_than)); + } + ksl.IfReturnVoid("is_smaller_than", is_smaller_than, [&]() { + // Swap key1 with key2. + write_element(0, current_keys_index, key2); + write_element(0, compare_keys_index, key1); + for (int64 i = 1; i <= num_values; ++i) { + // Also swap the values. 
+ auto value1 = read_element(i, current_keys_index); + auto value2 = read_element(i, compare_keys_index); + write_element(i, current_keys_index, value2); + write_element(i, compare_keys_index, value1); + } + }); }); } -void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index, - int64 dimension_to_sort, - int64 dimension_to_sort_bound, - PrimitiveType keys_type, - absl::Span xor_masks, - const std::vector& params, - const std::vector& param_shmem_buffers, - int64 tile_size, llvm::IRBuilder<>* b) { +void EmitTiledCompareLoop( + const IrArray::Index& tiled_keys_index, int64 dimension_to_sort, + int64 dimension_to_sort_bound, PrimitiveType keys_type, + absl::Span xor_masks, const std::vector& params, + const std::vector& param_shmem_buffers, + int64 iota_values_parameter_index, int64 tile_size, llvm::IRBuilder<>* b) { KernelSupportLibrary ksl(b); llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic( llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b); @@ -253,20 +262,22 @@ void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index, RoundDownToNearest(dimension_to_sort_bound, tile_size))), [&]() { EmitCompareLoopBody(dimension_to_sort_bound % tile_size, keys_type, - params.size() - 1, element_pair_index, xor_mask, + params.size() - 1, iota_values_parameter_index, + element_pair_index, xor_mask, tiled_keys_index.GetType(), read_element, write_element, b); }, [&]() { - EmitCompareLoopBody( - tile_size, keys_type, params.size() - 1, element_pair_index, - xor_mask, tiled_keys_index.GetType(), read_element, - write_element, b, /*needs_bounds_checks=*/false); + EmitCompareLoopBody(tile_size, keys_type, params.size() - 1, + iota_values_parameter_index, element_pair_index, + xor_mask, tiled_keys_index.GetType(), + read_element, write_element, b, + /*needs_bounds_checks=*/false); }); } else { EmitCompareLoopBody(tile_size, keys_type, params.size() - 1, - element_pair_index, xor_mask, - tiled_keys_index.GetType(), read_element, + iota_values_parameter_index, element_pair_index, + xor_mask, tiled_keys_index.GetType(), read_element, write_element, b, /*needs_bounds_checks=*/false); } // Wait until all comparisons have happened. 
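The behavioral change in the hunks above is the extra tie-break on the iota operand: when two keys compare equal, the element with the smaller original index is treated as smaller, which is what makes the sort stable. Below is a minimal host-side sketch of that ordering, not part of the patch; the std::sort harness, variable names, and sample data are illustrative assumptions standing in for the emitted IR.

```
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  // Keys to sort, plus an "iota" operand recording each element's original
  // position, playing the role of iota_values_parameter_index above.
  std::vector<int> keys = {3, 1, 3, 2};
  std::vector<int64_t> iota = {0, 1, 2, 3};

  std::vector<std::pair<int, int64_t>> pairs;
  for (size_t i = 0; i < keys.size(); ++i) {
    pairs.push_back({keys[i], iota[i]});
  }

  // The emitted IR swaps a pair when key2 < key1, OR when the keys are equal
  // and index2 < index1. As an ordering, that means: smaller key first; on
  // equal keys, smaller original index first, i.e. a stable sort.
  std::sort(pairs.begin(), pairs.end(), [](const auto& a, const auto& b) {
    return a.first < b.first || (a.first == b.first && a.second < b.second);
  });

  for (const auto& p : pairs) {
    std::cout << p.first << " (original index " << p.second << ")\n";
  }
  return 0;
}
```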
@@ -296,6 +307,7 @@ void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index, Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, const std::vector& values_arrays, + int64 iota_values_parameter_index, absl::string_view name, absl::Span xor_masks, llvm::IRBuilder<>* b, const gpu::LaunchDimensions& launch_dimensions, @@ -367,8 +379,8 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, if (xor_masks.size() > 1) { EmitTiledCompareLoop(keys_index, dimension_to_sort, dimension_to_sort_bound, keys_shape.element_type(), - xor_masks, params, param_shmem_buffers, tile_size, - b); + xor_masks, params, param_shmem_buffers, + iota_values_parameter_index, tile_size, b); } else { auto read_element = [&](int64 operand, llvm::Value* index) { keys_index[dimension_to_sort] = index; @@ -380,9 +392,10 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, params[operand].EmitWriteArrayElement(keys_index, value, b); }; EmitCompareLoopBody(dimension_to_sort_bound, keys_shape.element_type(), - values_arrays.size(), tiles_index[rank - 1], - xor_masks[0], tiles_index.GetType(), read_element, - write_element, b); + values_arrays.size(), iota_values_parameter_index, + tiles_index[rank - 1], xor_masks[0], + tiles_index.GetType(), read_element, write_element, + b); } return Status::OK(); }; diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h index 556a217322d..685f9383acb 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h @@ -31,9 +31,12 @@ namespace llvm_ir { // Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort' // dimension of 'keys_array'. All other dimensions are kept as-is. This // implements the inner loop of BitonicSort. It is assumed that 'xor_masks' -// contains only powers of 2, or values 2^k - 1 (k > 0). +// contains only powers of 2, or values 2^k - 1 (k > 0). If +// 'iota_values_parameter_index' is >= 0, it points at a 'values_arrays' operand +// that is a iota and can be used to make the sorting stable. 
Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, const std::vector& values_arrays, + int64 iota_values_parameter_index, absl::string_view name, absl::Span xor_masks, llvm::IRBuilder<>* b, const gpu::LaunchDimensions& launch_dimensions, diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index cca37556173..6c897009833 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -96,44 +96,18 @@ ExecutionOptions CreateExecutionOptions( const ExecutableBuildOptions& build_options, const ProgramShape* program_shape) { ExecutionOptions execution_options = CreateDefaultExecutionOptions(); - if (build_options.hlo_profile().has_value()) { - execution_options.mutable_debug_options()->set_xla_hlo_profile( - *build_options.hlo_profile()); - } - if (build_options.generate_hlo_graph().has_value()) { - execution_options.mutable_debug_options()->set_xla_generate_hlo_graph( - build_options.generate_hlo_graph().value()); - } - if (build_options.dump_optimized_hlo_proto_to().has_value()) { - execution_options.mutable_debug_options() - ->set_xla_dump_optimized_hlo_proto_to( - build_options.dump_optimized_hlo_proto_to().value()); - } - if (build_options.dump_unoptimized_hlo_proto_to().has_value()) { - execution_options.mutable_debug_options() - ->set_xla_dump_unoptimized_hlo_proto_to( - build_options.dump_unoptimized_hlo_proto_to().value()); - } - if (build_options.dump_per_pass_hlo_proto_to().has_value()) { - execution_options.mutable_debug_options() - ->set_xla_dump_per_pass_hlo_proto_to( - build_options.dump_per_pass_hlo_proto_to().value()); + if (build_options.has_debug_options()) { + *execution_options.mutable_debug_options() = build_options.debug_options(); } if (build_options.result_layout() != nullptr) { *execution_options.mutable_shape_with_output_layout() = - *build_options.result_layout(); + build_options.result_layout()->ToProto(); } else { + Shape result_shape(program_shape->result()); + LayoutUtil::SetToDefaultLayout(&result_shape); *execution_options.mutable_shape_with_output_layout() = - program_shape->result(); - LayoutUtil::SetToDefaultLayout( - execution_options.mutable_shape_with_output_layout()); + result_shape.ToProto(); } - - for (const std::string& disabled_pass : build_options.disabled_hlo_passes()) { - execution_options.mutable_debug_options()->add_xla_disable_hlo_passes( - disabled_pass); - } - return execution_options; } @@ -145,7 +119,7 @@ StatusOr> LocalService::CompileExecutable( const ExecutableBuildOptions& build_options) { const HloModuleProto& proto = computation.proto(); TF_RET_CHECK(proto.has_host_program_shape()); - const ProgramShape& program_shape = proto.host_program_shape(); + ProgramShape program_shape(proto.host_program_shape()); // Validate incoming layouts. 
if (argument_layouts.size() != program_shape.parameters_size()) { @@ -220,4 +194,10 @@ StatusOr LocalService::GlobalDataToShapedBuffer( return buffers[replica_number]; } +StatusOr LocalService::RegisterReplicatedBuffers( + std::vector replicated_buffers, const string& tag) { + return allocation_tracker_.RegisterReplicatedBuffers( + std::move(replicated_buffers), tag); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h index 3b4f0b50832..f56ba32b04b 100644 --- a/tensorflow/compiler/xla/service/local_service.h +++ b/tensorflow/compiler/xla/service/local_service.h @@ -63,6 +63,11 @@ class LocalService : public Service { StatusOr GlobalDataToShapedBuffer( const GlobalDataHandle& data, int replica_number); + // Registers a vector of shaped buffers of device memory, one per replica, and + // returns a corresponding handle that can be used for talking to XLA clients. + StatusOr RegisterReplicatedBuffers( + std::vector replicated_buffers, const string& tag); + private: explicit LocalService(const ServiceOptions& options, std::unique_ptr backend); diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc index ec52a24d782..972a5b9ced0 100644 --- a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc +++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc @@ -113,6 +113,13 @@ Status LogicalBufferAnalysis::HandleGetTupleElement(HloInstruction*) { return Status::OK(); } +Status LogicalBufferAnalysis::HandleAddDependency( + HloInstruction* add_dependency) { + // AddDependency just forwards the value of its zero-th operand and does not + // create buffers. + return Status::OK(); +} + Status LogicalBufferAnalysis::HandleCopy(HloInstruction* copy) { // The top-level buffer (index={}) for kCopy is newly created, but all other // buffers (in the case of a tuple shape) come from the operand diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h index 81f524d84a8..7ffca943d0f 100644 --- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h +++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h @@ -64,6 +64,7 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault { Status HandleRecvDone(HloInstruction* recv_done) override; Status HandleSend(HloInstruction* send) override; Status HandleTupleSelect(HloInstruction* tuple_select) override; + Status HandleAddDependency(HloInstruction* add_dependency) override; // A map from the buffer ID to the logical buffer std::vector> logical_buffers_; diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index 6152cdc6099..432aa1ea0b6 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_ +#include "absl/strings/str_replace.h" #include "absl/strings/string_view.h" #include "absl/utility/utility.h" #include "tensorflow/compiler/xla/layout_util.h" @@ -44,32 +45,48 @@ namespace xla { // // This pattern will match Add instructions whose first operand is a constant. // -// Each pattern type has the following modifiers: +// Each pattern type has the following modifiers, which are described where +// nontrivial. 
// // Op(): -// - WithName: match operations with the given name -// - WithOpcode: match operations with the given opcode -// - WithShape: match operations whose shape matches the given pattern -// - WithOperand: match operations whose operand matches the given pattern +// - Is: is the given HloInstruction* (i.e. pointer equality) +// - WithName +// - WithOpcode +// - WithoutOpcode: anything other than the given opcode +// - WithShape: instr's shape matches the given pattern +// - WithShapeEqualTo: instr's shape is equal to the given Shape +// - WithShapeCompatibleTo: instr's shape is compatible with the given Shape +// - WithNumOperands +// - WithOperand: operand at the given index matches the given pattern +// - IsConstant +// - IsNonConstant +// - IsConstantScalar/IsEffectiveConstantScalar: Optionally accepts a value, +// e.g. IsConstantScalar() or IsConstantScalar(42). +// - WithFusionKind +// - WithTupleIndex: get-tuple-element operations with the given tuple index +// - WithOneUse: Instruction is used as an operand exactly once. +// - WithOneUser: Instruction is used by exactly one other instruction, but +// is possibly used more than once as an operand (e.g. multiply(x,x)). // // Shape(): -// - EqualTo: matches shapes that are equal to the argument -// - CompatibleTo: matches shapes that are compatible to the argument -// - IsScalar/IsArray/IsTuple: matches scalar/array/tuple shapes -// - IsDenseArray/IsSparseArray: matches arrays with dense/sparse format -// - WithLayout: match shapes whose layout matches the given pattern -// - WithLayoutEqualTo: matches shapes whose layouts equal the argument -// - WithSubshape: matches tuple shapes whose subshape matches the given -// pattern -// - WithSubshapeEqualTo: matches shapes with a subshape equal the argument -// - WithElementType: matches array/scalar shapes with the given element -// type -// - WithRank: matches array/scalar types with the given rank +// - EqualTo +// - CompatibleTo +// - IsScalar/IsEffectiveScalar/IsArray/IsTuple +// - IsDenseArray/IsSparseArray +// - WithLayout: layout shape's layout matches the given pattern (e.g. +// Layout().WithDenseFormat()) +// - WithLayoutEqualTo: shape's layout equals the argument (i.e. another +// Layout, but not the result of Layout().foo()) +// - WithSubshape: shape is a tuple whose subshape matches the given pattern +// (e.g. Shape().IsScalar()). +// - WithSubshapeEqualTo: shape is a tuple with a subshape equal to the arg +// (i.e. another Shape, but not the result of Shape().foo()) +// - WithElementType: shape is an array/scalar with the given elem type +// - WithRank: shape is an array/scalar with the given rank // // Layout(): -// - EqualTo: matches layouts that are equal to the argument -// - WithDenseFormat/WithSparseFormat: matches layouts with dense/sparse -// format +// - EqualTo +// - WithDenseFormat/WithSparseFormat // // Op(), Shape(), and Layout() may be passed an argument of type // HloInstruction**, Shape**, or Layout**, respectively, or const versions of @@ -82,53 +99,55 @@ namespace xla { // CHECK(Match(foo, // match::Op().WithOperand(0, match::Op(&matched_operand)))); // -// Helpers are provided for common nullary, unary, binary, and ternary -// instructions. These helpers can be called with no arguments, in which case -// they will match any instruction matching the opcode. They may also be called -// with matches for the operands and with an optional capture. (The capture must -// be the first argument.) 
Some examples of these helpers and their equivalents -// are provided below. -// +// Helpers are provided for most HLO instructions. These helpers can be called +// with no arguments, in which case they will match any instruction matching the +// opcode. They may also be called with matches for the operands and with an +// optional capture. (The capture must be the first argument.) Some examples of +// these helpers and their equivalents are provided below. + // Example nullary instruction: -// Param() == Op().WithOpcode(HloOpcode::kParam) -// Param(&a) == Op(&a).WithOpcode(HloOpcode::kParam) +// Parameter() == Op().WithOpcode(HloOpcode::kParameter) +// Parameter(&a) == Op(&a).WithOpcode(HloOpcode::kParameter) // // Example unary instruction: -// Abs() == Op().WithOpcode(HloOpcode::kAbs) -// Abs(Op(&a)) == Op().WithOpcode(HloOpcode::kAbs) -// .WithOperand(0, Op(&a))) -// Abs(&a, Op(&b)) == Op(&a).WithOpcode(HloOpcode::kAbs) -// .WithOperand(0, Op(&b)) +// Abs() == Op().WithOpcode(HloOpcode::kAbs) +// Abs(Op(&a)) == Op().WithOpcode(HloOpcode::kAbs) +// .WithOperand(0, Op(&a))) +// Abs(&a, Op(&b)) == Op(&a).WithOpcode(HloOpcode::kAbs) +// .WithOperand(0, Op(&b)) // -// Example binary instruction: -// Add() == Op().WithOpcode(HloOpcode::kAdd) -// Add(Op(&a), Op(&b)) == Op().WithOpcode(HloOpcode::kAdd) -// .WithOperand(0, Op(&a)) -// .WithOperand(1, Op(&b)) -// Add(&a, Op(&b), Op(&c)) == Op(&a).WithOpcode(HloOpcode::kAdd) -// .WithOperand(0, Op(&b)) -// .WithOperand(1, Op(&c)) +// Commutative binary instructions have a special form that accepts either order +// of args, e.g.: // -// Example ternary instruction: -// Clamp() == Op().WithOpcode(HloOpcode::kClamp) -// Clamp(Op(&a), Op(&b), Op(&c)) == Op().WithOpcode(HloOpcode::kClamp) -// .WithOperand(0, Op(&a)) -// .WithOperand(1, Op(&b)) -// .WithOperand(2, Op(&c)) -// Clamp(&a, Op(&b), Op(&c), Op(&d)) == Op(&a).WithOpcode(HloOpcode::kClamp) -// .WithOperand(0, Op(&b)) -// .WithOperand(1, Op(&c)) -// .WithOperand(2, Op(&d)) +// AddAnyOrder(Parameter(1), Abs()) == +// Op().WithOpcode(HloOpcode::kAdd) +// .WithBinaryOperandsAnyOrder(Op().WithParameterNum(1), Abs()); // +// MultiplyAnyOrder(&a, Parameter(), Abs()) // Captures the mul in `a`. +// +// The following additional helpers are provided. In all cases, `&a` is +// optional. +// +// ConstantScalar(&a) == Op(&a).IsConstantScalar(); +// ConstantScalar(&a, v) == Op(&a).IsConstantScalar(v); +// ConstantEffectiveScalar(&a) == Op(&a).IsConstantEffectiveScalar(); +// ConstantEffectiveScalar(&a, v) == Op(&a).IsConstantEffectiveScalar(&a, v) +// NonConstant(&a) == Op(&a).IsNonConstant() +// GetTupleElement(&a, b, index) == Op(&a).WithTupleIndex(index) +// .WithOperand(0, b); +// Parameter(&a, n) == Op(&a).WithParameterNum(n); struct MatchOption { // If true, actually capture matched item into the user pointer. bool capture; + + // An explanation for why we failed to match is streamed here, if not-null. + std::ostream* explain_os; }; template bool Match(Value* value, const Pattern& pattern, - MatchOption option = {/*.capture=*/true}) { + MatchOption option = {/*.capture=*/true, /*.explain_os=*/nullptr}) { if (option.capture) { auto new_option = option; new_option.capture = false; @@ -143,6 +162,77 @@ namespace match { namespace detail { +// Macro for streaming to option.explain_os if it's not null. 
+// +// EXPLAIN << "value of foo(): " << foo() +// +#pragma push_macro("EXPLAIN") +#define EXPLAIN \ + if (option.explain_os) *option.explain_os + +// kIndentInc is the additional number of spaces that we indent by when we +// increase the indent "by one". +enum { + kIndentInc = 2, +}; + +// Writes a newline and then `indent` spaces. +// +// We follow an unintuitive convention in this file's pretty-printers: Indents +// are performed by the caller, not the callee. For example, if you want to +// print +// +// foo: +// - bar +// +// you'd do: +// +// Foo::DescribeTo(std::ostream* os, int64 indent) { +// *os << "foo:"; +// Indent(os, indent) // Create a newline at the *current* indent level. +// *os << " - "; +// bar.DescribeTo(os, indent + 3); // + 3 because strlen(" * ") == 3. +// } +// +// Bar::DescribeTo(std::ostream* os, int64 indent) { *os << "bar"; } +// +// Notice that Bar::DescribeTo() does not call Indent; the indenting is +// performed by Foo. This convention allows the caller to decide whether a +// matcher is preceded by a newline, which is important e.g. for the AllOf +// matcher. +// +// (Incidentally, indenting in Match's explanations is handled differently. +// Indents are a common case in DescribeTo [we're printing a whole tree], but +// they're a special case in Match [we're printing only a path through the tree +// that encounters a failing node]. Indents in Match only appear when we +// encounter a failing disjunction, so we just handle them as a special case +// there.) +inline void Indent(std::ostream* os, int64 indent) { + *os << "\n"; + for (int64 i = 0; i < indent; ++i) { + *os << " "; + } +} + +// SFINAE template that determines whether T declares a static member +// kIsTrivialMatcher. +// +// Trivial matchers get special treatment. For example, when printing +// a conjunction of matchers, we don't print "and" after a trivial matcher. This +// yields e.g. +// "a shape compatible with f32[1,2]" +// rather than +// "a shape AND compatible with f32[1,2]" +template +struct IsTrivialMatcher { + static constexpr bool value = false; +}; +template +struct IsTrivialMatcher::type> { + static constexpr bool value = true; +}; + template class AllOfPattern { public: @@ -162,10 +252,19 @@ class AllOfPattern { return matched; } + void DescribeTo(std::ostream* os, int64 indent = 0) const { + DescribeToImpl(os, std::integral_constant(), indent); + } + + // Accessor for patterns_. Please don't use this outside of this file. + const std::tuple& patterns() const { return patterns_; } + private: template bool MatchImpl(ItemType* item, MatchOption option, std::integral_constant) const { + // We don't need to do any EXPLAINing here; it's all correctly handled by + // our sub-matchers (if any fail). return std::get(patterns_).Match(item, option) && MatchImpl(item, option, std::integral_constant()); } @@ -176,6 +275,73 @@ class AllOfPattern { return true; } + // Pretty-printing a conjunction has some special cases to make it easy to + // read in the simple (common) case. + // + // If sizeof...(Patterns) == 1, prints as e.g. + // + // a shape + // + // If sizeof...(Patterns) == 2 and patterns_[0] is a trivial matcher (e.g. 
"a + // shape") prints as + // + // a shape compatible with f32[1,2] + // + // If sizeof...(Patterns) > 2 and patterns_[0] is a trivial matcher, prints as + // + // a shape: + // * compatible with f32[1,2] AND + // * that represents a scalar + // + // Otherwise prints as: + // + // all of: + // * foo AND + // * bar + // + template + void DescribeToImpl(std::ostream* os, std::integral_constant, + int64 indent) const { + constexpr bool first_is_trivial = + IsTrivialMatcher(patterns_))>::type>::value; + constexpr bool is_last = index == sizeof...(Patterns) - 1; + const auto& submatcher = std::get(patterns_); + + auto print_bulleted_item = [&] { + *os << " * "; + submatcher.DescribeTo(os, indent + 3); + if (!is_last) { + *os << " AND"; + Indent(os, indent); + } + }; + + if (index == 0) { + if (first_is_trivial || is_last) { + submatcher.DescribeTo(os, indent + kIndentInc); + if (sizeof...(Patterns) > 2) { + *os << ":"; + Indent(os, indent); + } + } else { + *os << "all of:"; + Indent(os, indent); + print_bulleted_item(); + } + } else if (first_is_trivial && index == 1 && sizeof...(Patterns) == 2) { + *os << " "; + submatcher.DescribeTo(os, indent); + } else { + print_bulleted_item(); + } + DescribeToImpl(os, std::integral_constant(), indent); + } + + void DescribeToImpl(std::ostream* os, + std::integral_constant, + int64 indent) const {} + std::tuple patterns_; }; @@ -183,10 +349,6 @@ class AllOfPattern { // Returns a pattern that represents the conjunction of all input patterns. All // patterns need to match in order to have the AllOf pattern match. -// -// TODO(timshen): Currently AllOf is still nested, e.g. AllOf, B> is -// not AllOf. We might want to flatten the AllOf type structure if the -// C++ compile error message gets annoying. template detail::AllOfPattern::type, Patterns...> AllOf( const Patterns&... patterns) { @@ -194,6 +356,25 @@ detail::AllOfPattern::type, Patterns...> AllOf( Patterns...>(patterns...); } +// AllOf, X, Y, ...> => AllOf. +// +// This transformation is necessary for good pretty-printing. +template +detail::AllOfPattern::type, InnerPs..., + OuterPs...> +AllOf(const detail::AllOfPattern& inner_p, + const OuterPs&... outer_ps) { + // Invoke constructor of AllOfPattern. + auto make_all_of = [](const InnerPs&... inner_ps, + const OuterPs&... 
outer_ps) { + return detail::AllOfPattern::type, + InnerPs..., OuterPs...>(inner_ps..., + outer_ps...); + }; + return absl::apply(make_all_of, std::tuple_cat(inner_p.patterns(), + std::make_tuple(outer_ps...))); +} + namespace detail { template @@ -204,8 +385,18 @@ class LayoutPattern; class LayoutPatternBaseImpl { public: bool Match(const ::xla::Layout* layout, MatchOption option) const { - return layout != nullptr; + if (layout == nullptr) { + EXPLAIN << "Layout is null"; + return false; + } + return true; } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "a layout"; + } + + static constexpr bool kIsTrivialMatcher = true; }; // A LayoutPattern implementation that matches only if the layout equals a @@ -216,7 +407,17 @@ class LayoutPatternEqualImpl { : layout_(layout) {} bool Match(const ::xla::Layout* layout, MatchOption option) const { - return LayoutUtil::Equal(*layout_, *layout); + if (!LayoutUtil::Equal(*layout_, *layout)) { + EXPLAIN << "Layout " << LayoutUtil::HumanString(*layout) + << " is not equal to expected " + << LayoutUtil::HumanString(*layout_); + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "equal to " << LayoutUtil::HumanString(*layout_); } private: @@ -230,7 +431,16 @@ class LayoutPatternFormatImpl { explicit constexpr LayoutPatternFormatImpl(Format format) : format_(format) {} bool Match(const ::xla::Layout* layout, MatchOption option) const { - return layout->format() == format_; + if (layout->format() != format_) { + EXPLAIN << "Layout has format " << Format_Name(layout->format()) + << " but expected " << Format_Name(format_); + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "with format " << Format_Name(format_); } private: @@ -242,11 +452,13 @@ template class LayoutPattern { private: template - LayoutPattern> - AppendImpl(NewImpl new_impl) const { - return LayoutPattern>( - AllOf(impl_, std::move(new_impl)), matched_layout_); + auto AppendImpl(NewImpl new_impl) const + -> LayoutPattern(std::declval(), + std::move(new_impl)))> { + auto new_allof = AllOf(impl_, std::move(new_impl)); + return LayoutPattern(std::move(new_allof), + matched_layout_); } public: @@ -276,6 +488,10 @@ class LayoutPattern { return false; } + void DescribeTo(std::ostream* os, int64 indent = 0) const { + impl_.DescribeTo(os, indent); + } + // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. constexpr auto EqualTo(const ::xla::Layout* layout) const @@ -306,19 +522,48 @@ class AnyOfPattern { explicit AnyOfPattern(const Patterns&... patterns) : patterns_(patterns...) {} bool Match(const Item* item, MatchOption option) const { - return MatchImpl(item, option, std::integral_constant()); + return MatchImpl(item, option); } bool Match(Item* item, MatchOption option) const { - return MatchImpl(item, option, std::integral_constant()); + return MatchImpl(item, option); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "any of:"; + Indent(os, indent); + DescribeToImpl(os, std::integral_constant(), indent); } private: + template + bool MatchImpl(ItemType* item, MatchOption option) const { + // If we're generating an explanation, buffer it until we know we failed. 
+ absl::optional explanation; + MatchOption new_option = option; + if (option.explain_os) { + new_option.explain_os = &explanation.emplace(); + } + bool rv = MatchRecursiveImpl(item, new_option, + std::integral_constant()); + if (!rv && option.explain_os) { + EXPLAIN << "None of the following matchers succeeded:"; + EXPLAIN << explanation->str(); + } + return rv; + } + template - bool MatchImpl(ItemType* item, MatchOption option, - std::integral_constant) const { + bool MatchRecursiveImpl(ItemType* item, MatchOption option, + std::integral_constant) const { auto new_option = option; new_option.capture = false; + + absl::optional explanation; + if (option.explain_os) { + new_option.explain_os = &explanation.emplace(); + } + // Try to match the sub-pattern without capturing behavior. if (std::get(patterns_).Match(item, new_option)) { // Capture the branch. @@ -337,20 +582,46 @@ class AnyOfPattern { // AnyOf will be a runtime number indicate which sub-pattern is matched. // Then we run another pass to do captures only with the help of the // trace. - bool ret = std::get(patterns_).Match(item, option); - DCHECK(ret); + bool matched = std::get(patterns_).Match(item, option); + DCHECK(matched); } return true; } - return MatchImpl(item, option, std::integral_constant()); + if (option.explain_os) { + EXPLAIN << "\nMatcher #" << index + 1; + EXPLAIN << "\n - "; + std::get(patterns_).DescribeTo(option.explain_os, /*indent=*/3); + EXPLAIN << "\nfailed with"; + EXPLAIN << "\n - "; + EXPLAIN << absl::StrReplaceAll(explanation->str(), {{"\n", "\n "}}); + } + return MatchRecursiveImpl(item, option, + std::integral_constant()); } template - bool MatchImpl(ItemType* item, MatchOption option, - std::integral_constant) const { + bool MatchRecursiveImpl( + ItemType* item, MatchOption option, + std::integral_constant) const { return false; } + template + void DescribeToImpl(std::ostream* os, std::integral_constant, + int64 indent) const { + *os << " - "; + std::get(patterns_).DescribeTo(os, indent + 3); + if (index != sizeof...(Patterns) - 1) { + *os << " OR"; + Indent(os, indent); + } + DescribeToImpl(os, std::integral_constant(), indent); + } + + void DescribeToImpl(std::ostream* os, + std::integral_constant, + int64 indent) const {} + std::tuple patterns_; }; @@ -395,8 +666,17 @@ class ShapePattern; class ShapePatternBaseImpl { public: bool Match(const ::xla::Shape* shape, MatchOption option) const { + if (shape == nullptr) { + EXPLAIN << "Shape is null"; + } return shape != nullptr; } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "a shape"; + } + + static constexpr bool kIsTrivialMatcher = true; }; // A ShapePattern implementation that matches only if the shape equals a Shape @@ -407,7 +687,16 @@ class ShapePatternEqualImpl { : shape_(shape) {} bool Match(const ::xla::Shape* shape, MatchOption option) const { - return ShapeUtil::Equal(*shape_, *shape); + if (!ShapeUtil::Equal(*shape_, *shape)) { + EXPLAIN << "Shape not equal to " + << ShapeUtil::HumanStringWithLayout(*shape_); + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "equal to " << ShapeUtil::HumanStringWithLayout(*shape_); } private: @@ -422,7 +711,16 @@ class ShapePatternCompatibleImpl { : shape_(shape) {} bool Match(const ::xla::Shape* shape, MatchOption option) const { - return ShapeUtil::Compatible(*shape_, *shape); + if (!ShapeUtil::Compatible(*shape_, *shape)) { + EXPLAIN << "Shape not compatible with " + << ShapeUtil::HumanString(*shape_); + return 
false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "compatible with " << ShapeUtil::HumanString(*shape_); } private: @@ -437,7 +735,16 @@ class ShapePatternElementTypeImpl { : element_type_(element_type) {} bool Match(const ::xla::Shape* shape, MatchOption option) const { - return shape->element_type() == element_type_; + if (shape->element_type() != element_type_) { + EXPLAIN << "Shape does not have element type " + << PrimitiveType_Name(element_type_); + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "with element type " << PrimitiveType_Name(element_type_); } private: @@ -450,7 +757,15 @@ class ShapePatternIsScalarImpl { explicit constexpr ShapePatternIsScalarImpl() {} bool Match(const ::xla::Shape* shape, MatchOption option) const { - return ShapeUtil::IsScalar(*shape); + if (!ShapeUtil::IsScalar(*shape)) { + EXPLAIN << "Shape is not a scalar"; + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "that represents a scalar"; } }; @@ -460,7 +775,15 @@ class ShapePatternIsArrayImpl { explicit constexpr ShapePatternIsArrayImpl() {} bool Match(const ::xla::Shape* shape, MatchOption option) const { - return ShapeUtil::IsArray(*shape); + if (!ShapeUtil::IsArray(*shape)) { + EXPLAIN << "Shape is not an array"; + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "that represents an array"; } }; @@ -470,7 +793,34 @@ class ShapePatternIsTupleImpl { explicit constexpr ShapePatternIsTupleImpl() {} bool Match(const ::xla::Shape* shape, MatchOption option) const { - return ShapeUtil::IsTuple(*shape); + if (!ShapeUtil::IsTuple(*shape)) { + EXPLAIN << "Shape is not a tuple"; + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "that represents a tuple"; + } +}; + +// A ShapePattern implementation that matches only if the shape is an effective +// scalar. +class ShapePatternEffectiveScalarImpl { + public: + explicit constexpr ShapePatternEffectiveScalarImpl() {} + + bool Match(const ::xla::Shape* shape, MatchOption option) const { + if (!ShapeUtil::IsEffectiveScalar(*shape)) { + EXPLAIN << "Shape is not an effective scalar"; + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "that is an effective scalar"; } }; @@ -481,7 +831,23 @@ class ShapePatternRankImpl { explicit constexpr ShapePatternRankImpl(int64 rank) : rank_(rank) {} bool Match(const ::xla::Shape* shape, MatchOption option) const { - return ShapeUtil::Rank(*shape) == rank_; + if (ShapeUtil::Rank(*shape) != rank_) { + if (rank_ == 0) { + EXPLAIN << "Shape is not a scalar"; + } else { + EXPLAIN << "Shape does not have rank " << rank_; + } + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + if (rank_ == 0) { + *os << "that is a scalar"; + } else { + *os << "that has " << rank_ << " dimension" << (rank_ != 1 ? 
"s" : ""); + } } private: @@ -503,8 +869,21 @@ class ShapePatternLayoutImpl { } bool Match(Shape* shape, MatchOption option) const { - return LayoutUtil::HasLayout(*shape) && - layout_.Match(shape->mutable_layout(), option); + if (!LayoutUtil::HasLayout(*shape)) { + EXPLAIN << "Shape does not have a layout"; + return false; + } + if (!layout_.Match(shape->mutable_layout(), option)) { + EXPLAIN << "\nin layout"; + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "with"; + Indent(os, indent + kIndentInc); + layout_.DescribeTo(os, indent + kIndentInc); } private: @@ -522,17 +901,40 @@ class ShapePatternSubshapeImpl { : index_(index), subshape_(subshape) {} bool Match(const ::xla::Shape* shape, MatchOption option) const { - return ShapeUtil::IndexIsValid(*shape, index_) && - subshape_.Match(&ShapeUtil::GetSubshape(*shape, index_), option); + return MatchImpl(shape, option); } bool Match(::xla::Shape* shape, MatchOption option) const { - return ShapeUtil::IndexIsValid(*shape, index_) && - subshape_.Match(ShapeUtil::GetMutableSubshape(shape, index_), - option); + return MatchImpl(shape, option); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "with subshape at index " << index_.ToString() << " which is"; + Indent(os, indent + kIndentInc); + subshape_.DescribeTo(os, indent + kIndentInc); } private: + Shape* GetSubshape(Shape* shape) const { + return ShapeUtil::GetMutableSubshape(shape, index_); + } + const Shape* GetSubshape(const Shape* shape) const { + return &ShapeUtil::GetSubshape(*shape, index_); + } + + template + bool MatchImpl(ShapeType* shape, MatchOption option) const { + if (!ShapeUtil::IndexIsValid(*shape, index_)) { + EXPLAIN << "No subshape at " << index_.ToString(); + return false; + } + if (!subshape_.Match(GetSubshape(shape), option)) { + EXPLAIN << "\nin subshape at " << index_.ToString(); + return false; + } + return true; + } + ShapeIndexView index_; ShapePattern subshape_; }; @@ -542,10 +944,12 @@ template class ShapePattern { private: template - ShapePattern> AppendImpl( - NewImpl new_impl) const { - return ShapePattern>( - AllOf(impl_, std::move(new_impl)), matched_shape_); + auto AppendImpl(NewImpl new_impl) const + -> ShapePattern(std::declval(), + std::move(new_impl)))> { + auto new_all_of = AllOf(impl_, std::move(new_impl)); + return ShapePattern(std::move(new_all_of), + matched_shape_); } public: @@ -560,6 +964,11 @@ class ShapePattern { } return true; } + if (shape) { + EXPLAIN << "\nin " + << (shape->has_layout() ? ShapeUtil::HumanStringWithLayout(*shape) + : ShapeUtil::HumanString(*shape)); + } return false; } @@ -571,9 +980,16 @@ class ShapePattern { } return true; } + EXPLAIN << "\nin " + << (shape->has_layout() ? ShapeUtil::HumanStringWithLayout(*shape) + : ShapeUtil::HumanString(*shape)); return false; } + void DescribeTo(std::ostream* os, int64 indent = 0) const { + return impl_.DescribeTo(os, indent); + } + // Modifies the pattern to match only if the shape equals the given proto. // The layout must outlive the returned pattern. constexpr auto EqualTo(const ::xla::Shape* shape) const @@ -612,6 +1028,11 @@ class ShapePattern { return AppendImpl(ShapePatternIsTupleImpl()); } + constexpr auto IsEffectiveScalar() const + -> decltype(this->AppendImpl(ShapePatternEffectiveScalarImpl())) { + return AppendImpl(ShapePatternEffectiveScalarImpl()); + } + // Modifies the pattern to match only if the shape has the given rank. 
constexpr auto WithRank(int64 rank) const -> decltype(this->AppendImpl(ShapePatternRankImpl(rank))) { @@ -706,6 +1127,22 @@ Shape(::xla::Shape** matched_shape) { namespace detail { +// Overloads to get a const or non-const operand out of an instruction. +inline HloInstruction* HloOperand(HloInstruction* instr, int64 idx) { + return instr->mutable_operand(idx); +} +inline const HloInstruction* HloOperand(const HloInstruction* instr, + int64 idx) { + return instr->operand(idx); +} + +// Pretty-printer for HloInstruction. Sort of like ToShortString, but with +// fewer %s and more shapes. +inline string InstToString(const HloInstruction* inst) { + return inst->ToString( + HloPrintOptions().set_print_metadata(false).set_print_percent(false)); +} + template class HloInstructionPattern; @@ -714,8 +1151,18 @@ class HloInstructionPattern; class HloInstructionPatternBaseImpl { public: bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { - return inst != nullptr; + if (inst == nullptr) { + EXPLAIN << "HloInstruction* is null"; + return false; + } + return true; } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "an HloInstruction"; + } + + static constexpr bool kIsTrivialMatcher = true; }; // An HloInstructionPattern implementation that matches only if the instruction @@ -726,13 +1173,44 @@ class HloInstructionPatternNameImpl { : name_(name) {} bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { - return inst->name() == name_; + if (inst->name() != name_) { + EXPLAIN << "HloInstruction not named \"" << name_ << "\""; + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "named \"" << name_ << "\""; } private: absl::string_view name_; }; +// An HloInstructionPattern implementation that matches only if the instruction +// equals a particular pointer. +class HloInstructionIsImpl { + public: + explicit HloInstructionIsImpl(const HloInstruction* inst) : inst_(inst) {} + + bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { + if (inst != inst_) { + EXPLAIN << "HloInstruction " << inst << " is not " << inst_ << " (" + << InstToString(inst_) << ")"; + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "which is " << inst_ << " (" << InstToString(inst_) << ")"; + } + + private: + const HloInstruction* inst_; +}; + // An HloInstructionPattern implementation that matches only if the instruction // has a given opcode. 
class HloInstructionPatternOpcodeImpl { @@ -742,7 +1220,25 @@ class HloInstructionPatternOpcodeImpl { : opcode_(opcode), invert_(invert) {} bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { - return (invert_ ^ (inst->opcode() == opcode_)); + if (invert_ && inst->opcode() == opcode_) { + EXPLAIN << "HloInstruction has opcode " << HloOpcodeString(opcode_) + << ", expected anything else"; + return false; + } + if (!invert_ && inst->opcode() != opcode_) { + EXPLAIN << "HloInstruction doesn't have opcode " + << HloOpcodeString(opcode_); + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + if (!invert_) { + *os << "with opcode " << HloOpcodeString(opcode_); + } else { + *os << "with any opcode other than " << HloOpcodeString(opcode_); + } } private: @@ -757,8 +1253,17 @@ class HloInstructionPatternNumOperandsImpl { explicit constexpr HloInstructionPatternNumOperandsImpl(int64 num_operands) : num_operands_(num_operands) {} - bool Match(const ::xla::HloInstruction* inst, MatchOption /*option*/) const { - return inst->operand_count() == num_operands_; + bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { + if (inst->operand_count() != num_operands_) { + EXPLAIN << "HloInstruction doesn't have " << num_operands_ << " operands"; + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "with " << num_operands_ << " operand" + << (num_operands_ != 1 ? "s" : ""); } private: @@ -775,11 +1280,25 @@ class HloInstructionPatternShapeImpl { : shape_(shape) {} bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { - return shape_.Match(&inst->shape(), option); + if (!shape_.Match(&inst->shape(), option)) { + EXPLAIN << "\nin output shape"; + return false; + } + return true; } bool Match(::xla::HloInstruction* inst, MatchOption option) const { - return shape_.Match(inst->mutable_shape(), option); + if (!shape_.Match(inst->mutable_shape(), option)) { + EXPLAIN << "\nin output shape"; + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "outputting"; + Indent(os, indent + kIndentInc); + shape_.DescribeTo(os, indent + kIndentInc); } private: @@ -797,20 +1316,197 @@ class HloInstructionPatternOperandImpl { : operand_index_(operand_index), operand_(operand) {} bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { - return operand_index_ < inst->operand_count() && - operand_.Match(inst->operand(operand_index_), option); + return MatchImpl(inst, option); } bool Match(::xla::HloInstruction* inst, MatchOption option) const { - return operand_index_ < inst->operand_count() && - operand_.Match(inst->mutable_operand(operand_index_), option); + return MatchImpl(inst, option); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "with operand " << operand_index_ << " which is:"; + Indent(os, indent + kIndentInc); + operand_.DescribeTo(os, indent + kIndentInc); } private: + template + bool MatchImpl(HloInstructionType* inst, MatchOption option) const { + if (operand_index_ >= inst->operand_count()) { + EXPLAIN << "desired operand index " << operand_index_ + << " is out of bounds"; + return false; + } + if (!operand_.Match(HloOperand(inst, operand_index_), option)) { + EXPLAIN << "\nin operand " << operand_index_; + return false; + } + return true; + } + int64 operand_index_; HloInstructionPattern operand_; }; +// Matches a binary instruction whose operands 
come in any order. +template +class HloInstructionPatternBinaryOperandsAnyOrderImpl { + public: + explicit constexpr HloInstructionPatternBinaryOperandsAnyOrderImpl( + const HloInstructionPattern& op1, + const HloInstructionPattern& op2) + : op1_(op1), op2_(op2) {} + + bool Match(HloInstruction* inst, MatchOption option) const { + return MatchImpl(inst, option); + } + + bool Match(const HloInstruction* inst, MatchOption option) const { + return MatchImpl(inst, option); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "with two operands in either order:"; + Indent(os, indent); + *os << " - "; + op1_.DescribeTo(os, indent + 3); + Indent(os, indent); + *os << " - "; + op2_.DescribeTo(os, indent + 3); + } + + private: + HloInstruction* operand(HloInstruction* inst, int64 idx) const { + return inst->mutable_operand(idx); + } + const HloInstruction* operand(const HloInstruction* inst, int64 idx) const { + return inst->operand(idx); + } + + template + bool MatchImpl(HloInstructionType* inst, MatchOption option) const { + // We could implement this using AnyOf and AllOf matchers, but the templates + // get pretty difficult to debug, since any compile error herein becomes + // not-an-error via SFINAE. Also this way lets us give better messages on + // failure. + if (inst->operand_count() != 2) { + EXPLAIN << "HloInstruction did not have two operands"; + return false; + } + + // If we're not generating explanations, this is pretty simple. + if (!option.explain_os) { + auto try_match = [&](int64 idx1, int64 idx2) { + MatchOption new_option = option; + new_option.capture = false; + if (op1_.Match(operand(inst, idx1), new_option) && + op2_.Match(operand(inst, idx2), new_option)) { + if (option.capture) { + bool matched = op1_.Match(operand(inst, idx1), option) && + op2_.Match(operand(inst, idx2), option); + DCHECK(matched); + } + return true; + } + return false; + }; + return try_match(0, 1) || try_match(1, 0); + } + + // If we are generating explanations, we have some work to do in order to + // generate a helpful error. + // + // First, try all four operand/matcher combinations, recording the + // failure explanations separately from option.explain_os. matches[i][j] + // tells us if matcher_i matches operand j. + bool matches[/*matcher*/ 2][/*operand*/ 2]; + std::stringstream explanations[/*matcher*/ 2][/*operand*/ 2]; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + MatchOption new_option = option; + new_option.capture = false; + new_option.explain_os = &explanations[i][j]; + matches[i][j] = i == 0 ? op1_.Match(operand(inst, j), new_option) + : op2_.Match(operand(inst, j), new_option); + } + } + + // Check if the match succeeded. + for (int i = 0; i < 2; ++i) { + if (matches[0][i] && matches[1][(i + 1) % 2]) { + // Rerun the matches with capture enabled if necessary. + if (option.capture) { + auto* operand1 = operand(inst, i); + auto* operand2 = operand(inst, (i + 1) % 2); + bool matched = + op1_.Match(operand1, option) && op2_.Match(operand2, option); + DCHECK(matched); + } + return true; + } + } + + auto describe_matcher = [&](int matcher_idx) { + EXPLAIN << "\n - "; + if (matcher_idx == 0) { + op1_.DescribeTo(option.explain_os, /*indent=*/3); + } else { + CHECK_EQ(matcher_idx, 1); + op2_.DescribeTo(option.explain_os, /*indent=*/3); + } + for (int i = 0; i < 2; ++i) { + if (matches[matcher_idx][/*operand*/ i]) { + continue; + } + EXPLAIN << "\ndoes not match " << (i == 0 ? 
"LHS" : "RHS") << ":\n"; + EXPLAIN << " - "; + EXPLAIN << absl::StrReplaceAll( + explanations[matcher_idx][/*operand*/ i].str(), {{"\n", "\n "}}); + } + }; + + // If we failed to match, one of the following is true: + // 1. op1 (op2) matches neither LHS nor RHS, or + // 2. op1 and op2 both match LHS (RHS), but neither matches RHS (LHS). + // We print different explanations depending on which case we're in. + + // Case 1. + bool wrote_explanation = false; + for (int i = 0; !wrote_explanation && i < 2; ++i) { + if (!matches[i][0] && !matches[i][1]) { + EXPLAIN << "HloInstruction's operands (ignoring order) did not match " + << (i == 0 ? "first" : "second") << " matcher. Specifically,"; + describe_matcher(i); + wrote_explanation = true; + } + } + + // Case 2. + for (int i = 0; !wrote_explanation && i < 2; ++i) { + if (matches[/*matcher*/ 0][/*operand*/ i] && + matches[/*matcher*/ 1][/*operand*/ i]) { + CHECK(!matches[0][(i + 1) % 2]); + CHECK(!matches[1][(i + 1) % 2]); + CHECK(!wrote_explanation); + EXPLAIN << "HloInstruction's " << (i == 1 ? "LHS" : "RHS") + << " operand did not match either of the two matchers. " + "Specifically,"; + describe_matcher(0); + EXPLAIN << "\nand"; + describe_matcher(1); + wrote_explanation = true; + } + } + + CHECK(wrote_explanation); + return false; + } + + HloInstructionPattern op1_; + HloInstructionPattern op2_; +}; + // An HloInstructionPattern implementation that matches only if the instruction // is a fusion node with a particular kind. class HloInstructionPatternFusionKindImpl { @@ -820,14 +1516,32 @@ class HloInstructionPatternFusionKindImpl { : kind_(kind) {} bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { - return inst->opcode() == HloOpcode::kFusion && inst->fusion_kind() == kind_; + return MatchImpl(inst, option); } bool Match(::xla::HloInstruction* inst, MatchOption option) const { - return inst->opcode() == HloOpcode::kFusion && inst->fusion_kind() == kind_; + return MatchImpl(inst, option); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "with fusion kind " << ToString(kind_); } private: + template + bool MatchImpl(HloInstructionType* inst, MatchOption option) const { + if (inst->opcode() != HloOpcode::kFusion) { + EXPLAIN << "HloInstruction does not have fusion kind " << ToString(kind_) + << "; it's not a fusion"; + return false; + } + if (inst->fusion_kind() != kind_) { + EXPLAIN << "HloInstruction does not have fusion kind " << ToString(kind_); + return false; + } + return true; + } + ::xla::HloInstruction::FusionKind kind_; }; @@ -839,47 +1553,211 @@ class HloInstructionPatternTupleIndexImpl { : tuple_index_(tuple_index) {} bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { - return inst->opcode() == HloOpcode::kGetTupleElement && - inst->tuple_index() == tuple_index_; + return MatchImpl(inst, option); } bool Match(::xla::HloInstruction* inst, MatchOption option) const { - return inst->opcode() == HloOpcode::kGetTupleElement && - inst->tuple_index() == tuple_index_; + return MatchImpl(inst, option); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "which is a GTE with index " << tuple_index_; } private: + template + bool MatchImpl(HloInstructionType* inst, MatchOption option) const { + if (inst->opcode() != HloOpcode::kGetTupleElement) { + EXPLAIN << "HloInstruction is not a GTE with index " << tuple_index_ + << "; it's not a GTE at all"; + return false; + } + if (inst->tuple_index() != tuple_index_) { + EXPLAIN << "HloInstruction is not a 
GTE with index " << tuple_index_; + return false; + } + return true; + } + int64 tuple_index_; }; -template -class HloPredicatePatternImpl { +class HloInstructionPatternParameterNumImpl { public: - explicit HloPredicatePatternImpl(Predicate pred) : pred_(std::move(pred)) {} + explicit constexpr HloInstructionPatternParameterNumImpl(int64 parameter_num) + : parameter_num_(parameter_num) {} - bool Match(const ItemType* item, MatchOption option) const { - return pred_(item); + bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { + return MatchImpl(inst, option); } - bool Match(ItemType* item, MatchOption option) const { return pred_(item); } + bool Match(::xla::HloInstruction* inst, MatchOption option) const { + return MatchImpl(inst, option); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "which is parameter " << parameter_num_; + } private: - Predicate pred_; + template + bool MatchImpl(HloInstructionType* inst, MatchOption option) const { + if (inst->opcode() != HloOpcode::kParameter || + inst->parameter_number() != parameter_num_) { + EXPLAIN << "HloInstruction is not parameter " << parameter_num_; + return false; + } + return true; + } + + int64 parameter_num_; }; -struct PatternFriend; +// Superclass that contains common code used by Op::WithOneUse() and +// Op::WithOneUser(). +class HloInstructionPatternOneUseOrUserImpl { + protected: + bool MatchOneUser(const HloInstruction* inst, MatchOption option) const { + if (inst->user_count() != 1) { + EXPLAIN << "HloInstruction has " << inst->user_count() + << " users, but expected exactly one."; + if (inst->user_count() > 1) { + EXPLAIN << "\nAll users:"; + for (const HloInstruction* user : inst->users()) { + EXPLAIN << "\n - " << InstToString(user); + } + } + return false; + } + return true; + } +}; + +class HloInstructionPatternOneUseImpl + : public HloInstructionPatternOneUseOrUserImpl { + public: + bool Match(const HloInstruction* inst, MatchOption option) const { + if (!MatchOneUser(inst, option)) { + return false; + } + + int64 use_count = absl::c_count_if( + inst->users()[0]->operands(), + [&](const HloInstruction* operand) { return operand == inst; }); + if (use_count != 1) { + EXPLAIN << "HloInstruction is used " << use_count + << " times by its user, but is expected to be used just once: " + << InstToString(inst->users()[0]); + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "which has exactly one use"; + } +}; + +class HloInstructionPatternOneUserImpl + : public HloInstructionPatternOneUseOrUserImpl { + public: + bool Match(const HloInstruction* inst, MatchOption option) const { + return MatchOneUser(inst, option); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "which has exactly one user (but possibly is used multiple times by " + "that instruction)"; + } +}; + +// Matches a constant scalar or effective scalar, optionally with a given value. 
+template +class HloConstantScalarImpl { + public: + explicit constexpr HloConstantScalarImpl(bool match_effective_scalar) + : val_(absl::nullopt), match_effective_scalar_(match_effective_scalar) {} + + constexpr HloConstantScalarImpl(ScalarTy val, bool match_effective_scalar) + : val_(val), match_effective_scalar_(match_effective_scalar) {} + + bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { + return MatchImpl(inst, option); + } + + bool Match(::xla::HloInstruction* inst, MatchOption option) const { + return MatchImpl(inst, option); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "which is a constant " + << (match_effective_scalar_ ? "effective " : "") << "scalar"; + if (val_.has_value()) { + *os << " with value " << *val_; + } + } + + private: + template + bool MatchImpl(InstTy* inst, MatchOption option) const { + const auto* const_inst = DynCast(inst); + if (!const_inst) { + EXPLAIN << "HloInstruction is not a constant"; + return false; + } + if (match_effective_scalar_ && + !ShapeUtil::IsEffectiveScalar(inst->shape())) { + EXPLAIN << "HloInstruction is not an effective scalar"; + return false; + } + if (!match_effective_scalar_ && !ShapeUtil::IsScalar(inst->shape())) { + EXPLAIN << "HloInstruction is not a scalar"; + return false; + } + if (!val_.has_value()) { + return true; + } + + // Check that literal == static_cast(val) and + // val == static_cast(literal). This is sufficient to ensure that + // the two constant scalars are actually "equal". + auto val_literal = LiteralUtil::CreateR0(*val_); + auto literal_r0_or = const_inst->literal().Reshape({}); + auto val_as_literal_ty_or = + val_literal.Convert(const_inst->shape().element_type()); + if (!literal_r0_or.ok() || !val_as_literal_ty_or.ok()) { + EXPLAIN << "could not construct relevant Literals (how did this happen?)"; + return false; + } + auto literal_r0 = std::move(literal_r0_or).ValueOrDie(); + auto val_as_literal_ty = std::move(val_as_literal_ty_or).ValueOrDie(); + auto literal_r0_as_val_ty_or = + literal_r0.Convert(val_literal.shape().element_type()); + bool rv = literal_r0_as_val_ty_or.ok() && // + literal_r0_as_val_ty_or.ValueOrDie() == val_literal && + literal_r0 == val_as_literal_ty; + if (!rv) { + EXPLAIN << "HloInstruction's constant value " << literal_r0.ToString() + << " did not match expected value " << *val_; + } + return rv; + } + + absl::optional val_; + bool match_effective_scalar_; +}; // A pattern that matches HloInstructions. 
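The two-way cast in `HloConstantScalarImpl::MatchImpl` is the crux of the value comparison, so here is the same round-trip idea reduced to plain scalars. This is an illustrative sketch only, not XLA code; `int` stands in for the constant's element type and `double` for `ScalarTy`.

```
#include <iostream>

// Round-trip equality: literal == cast(val) AND val == cast(literal).
// One direction alone is not enough: casting 1.25 to int gives 1, which
// equals a stored constant of 1, but casting that constant back gives
// 1.0 != 1.25, so the pair is correctly rejected.
bool RoundTripEqual(int literal, double val) {
  return literal == static_cast<int>(val) &&
         static_cast<double>(literal) == val;
}

int main() {
  std::cout << RoundTripEqual(1, 1.0) << "\n";   // 1: accepted, like ConstantScalar(1)
  std::cout << RoundTripEqual(1, 1.25) << "\n";  // 0: rejected despite the int cast agreeing
}
```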
template class HloInstructionPattern { private: template - HloInstructionPattern> - AppendImpl(NewImpl new_impl) const { - return HloInstructionPattern< - HloInstructionType, AllOfPattern<::xla::HloInstruction, Impl, NewImpl>>( - AllOf(impl_, std::move(new_impl)), matched_inst_); + auto AppendImpl(NewImpl new_impl) const -> HloInstructionPattern< + HloInstructionType, decltype(AllOf( + std::declval(), std::move(new_impl)))> { + auto new_allof = AllOf(impl_, std::move(new_impl)); + return HloInstructionPattern( + std::move(new_allof), matched_inst_); } public: @@ -895,6 +1773,9 @@ class HloInstructionPattern { } return true; } + if (inst != nullptr) { + EXPLAIN << "\nin " << InstToString(inst); + } return false; } @@ -906,6 +1787,7 @@ class HloInstructionPattern { } return true; } + EXPLAIN << "\nin " << InstToString(inst); return false; } @@ -935,12 +1817,47 @@ class HloInstructionPattern { return AppendImpl(HloInstructionPatternOpcodeImpl(opcode, true)); } + constexpr auto Is(const HloInstruction* instr) const + -> decltype(this->AppendImpl(HloInstructionIsImpl(instr))) { + return AppendImpl(HloInstructionIsImpl(instr)); + } + // Modifies the pattern to match only if the instruction is a constant. constexpr auto IsConstant() const -> decltype(this->WithOpcode(HloOpcode::kConstant)) { return WithOpcode(HloOpcode::kConstant); } + constexpr auto IsConstantScalar() const -> decltype(this->AppendImpl( + HloConstantScalarImpl(/*match_effective_scalar=*/false))) { + return AppendImpl( + HloConstantScalarImpl(/*match_effective_scalar=*/false)); + } + + // This does not check that T has the same type as the instruction, so e.g. + // IsConstantScalar(1.0) may match a constant of shape int32[]. + template + constexpr auto IsConstantScalar(const ScalarTy& val) const + -> decltype(this->AppendImpl(HloConstantScalarImpl( + val, /*match_effective_scalar=*/false))) { + return AppendImpl( + HloConstantScalarImpl(val, /*match_effective_scalar=*/false)); + } + + constexpr auto IsConstantEffectiveScalar() const -> decltype(this->AppendImpl( + HloConstantScalarImpl(/*match_effective_scalar=*/true))) { + return AppendImpl( + HloConstantScalarImpl(/*match_effective_scalar=*/true)); + } + + template + constexpr auto IsConstantEffectiveScalar(const ScalarTy& val) const + -> decltype(this->AppendImpl(HloConstantScalarImpl( + val, /*match_effective_scalar=*/true))) { + return AppendImpl( + HloConstantScalarImpl(val, /*match_effective_scalar=*/true)); + } + // Modifies the pattern to match only if the instruction is not a constant. constexpr auto IsNonConstant() const -> decltype(this->WithoutOpcode(HloOpcode::kConstant)) { @@ -957,6 +1874,22 @@ class HloInstructionPattern { HloInstructionPatternShapeImpl(shape)); } + // Make this a templated function to work around gcc 4.9.4 template infinite + // recursion bug. + template + constexpr auto WithShapeEqualTo(const ::xla::Shape* shape) + -> decltype(this->WithShape(Shape().EqualTo(shape))) { + return WithShape(Shape().EqualTo(shape)); + } + + // Make this a templated function to work around gcc 4.9.4 template infinite + // recursion bug. + template + constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape) + -> decltype(this->WithShape(Shape().CompatibleTo(shape))) { + return WithShape(Shape().CompatibleTo(shape)); + } + // Modifies the pattern to match only if the instruction has an operand that // matches the given pattern. 
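The fluent constant-scalar and shape helpers added above are exercised in the tests further down; a condensed usage sketch follows, assuming a hypothetical `HloInstruction* root` and the `m` alias for `xla::match`. The operand matcher introduced by the preceding comment continues below this aside.

```
namespace m = xla::match;

Shape f16_scalar = ShapeUtil::MakeShape(F16, {});

// Constant scalar with a specific value; per the comment above, the C++ type
// of the value is not checked against the constant's element type.
bool is_42 = Match(root, m::Op().IsConstantScalar(42));

// Exact shape (including layout) vs. shape compatibility (element type and
// dimensions, ignoring layout).
bool exact      = Match(root, m::Constant().WithShapeEqualTo(&f16_scalar));
bool compatible = Match(root, m::Constant().WithShapeCompatibleTo(&f16_scalar));
```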
template @@ -971,6 +1904,20 @@ class HloInstructionPattern { operand_index, operand)); } + template + constexpr auto WithBinaryOperandsAnyOrder( + const HloInstructionPattern& op1, + const HloInstructionPattern& op2) const + -> decltype(this->AppendImpl( + HloInstructionPatternBinaryOperandsAnyOrderImpl< + OperandType1, OperandImpl1, OperandType2, OperandImpl2>(op1, + op2))) { + return AppendImpl( + HloInstructionPatternBinaryOperandsAnyOrderImpl< + OperandType1, OperandImpl1, OperandType2, OperandImpl2>(op1, op2)); + } + // Modifies the pattern to match only if the instruction is a fusion node with // the given kind. constexpr auto WithFusionKind(HloInstruction::FusionKind kind) const @@ -985,17 +1932,34 @@ class HloInstructionPattern { return AppendImpl(HloInstructionPatternTupleIndexImpl(tuple_index)); } - private: - template - constexpr auto WithPredicate(Predicate pred) const -> decltype( - this->AppendImpl(HloPredicatePatternImpl( - std::move(pred)))) { - return AppendImpl( - HloPredicatePatternImpl(std::move(pred))); + // Modifies the pattern to match only if the instruction is a parameter + // with the given parameter number. + constexpr auto WithParameterNum(int64 parameter_num) const -> decltype( + this->AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num))) { + return AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num)); } - friend struct PatternFriend; + // Modifies the pattern to match if the instruction is used exactly once. + // Does not match if the instruction is used twice by the same user (e.g. + // multiply(x,x)). + constexpr auto WithOneUse() const + -> decltype(this->AppendImpl(HloInstructionPatternOneUseImpl())) { + return AppendImpl(HloInstructionPatternOneUseImpl()); + } + // Modifies the pattern to match if the instruction is used by exactly one + // other instruction. Will match if the instruction is used twice, so long as + // it's by the same user (e.g. multiply(x,x)). + constexpr auto WithOneUser() const + -> decltype(this->AppendImpl(HloInstructionPatternOneUserImpl())) { + return AppendImpl(HloInstructionPatternOneUserImpl()); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + impl_.DescribeTo(os, indent); + } + + private: Impl impl_; HloInstructionType** matched_inst_; }; @@ -1036,6 +2000,7 @@ Op(::xla::HloInstruction** matched_inst) { XLA_NULLOP_PATTERN(Constant) XLA_NULLOP_PATTERN(Parameter) XLA_NULLOP_PATTERN(Iota) +XLA_NULLOP_PATTERN(Rng) #undef XLA_NULLOP_PATTERN // Helpers for unary instructions. 
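A usage sketch of the parameter-number and tuple-index constraints added above, assuming hypothetical instructions `p1` (a parameter with number 1) and `gte` (a get-tuple-element with index 0), plus the usual `m` alias; the unary helpers announced by the preceding comment follow after this aside.

```
namespace m = xla::match;

bool is_p1   = Match(p1, m::Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(1));
bool is_gte0 = Match(gte, m::Op().WithOpcode(HloOpcode::kGetTupleElement).WithTupleIndex(0));

// Equivalent shorthand defined further below in this header:
bool also_p1 = Match(p1, m::Parameter(1));
```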
@@ -1067,8 +2032,10 @@ XLA_UNOP_PATTERN(RoundNearestAfz) XLA_UNOP_PATTERN(Bitcast) XLA_UNOP_PATTERN(Broadcast) XLA_UNOP_PATTERN(Ceil) +XLA_UNOP_PATTERN(Convert) XLA_UNOP_PATTERN(Copy) XLA_UNOP_PATTERN(Cos) +XLA_UNOP_PATTERN(CrossReplicaSum) XLA_UNOP_PATTERN(Exp) XLA_UNOP_PATTERN(Fft) XLA_UNOP_PATTERN(Floor) @@ -1088,6 +2055,7 @@ XLA_UNOP_PATTERN(Reverse) XLA_UNOP_PATTERN(SendDone) XLA_UNOP_PATTERN(Sign) XLA_UNOP_PATTERN(Sin) +XLA_UNOP_PATTERN(Slice) XLA_UNOP_PATTERN(Sort) XLA_UNOP_PATTERN(Tanh) XLA_UNOP_PATTERN(Transpose) @@ -1125,25 +2093,32 @@ XLA_UNOP_PATTERN(Transpose) #define XLA_COMMUTATIVE_BINOP_PATTERN(NAME) \ XLA_BINOP_PATTERN(NAME) \ \ - template \ - inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs) \ - ->decltype(AnyOf(NAME(lhs, rhs), NAME(rhs, lhs))) { \ - return AnyOf(NAME(lhs, rhs), NAME(rhs, lhs)); \ - } \ - \ template \ inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs, \ Rhs&& rhs) \ - ->decltype(AnyOf(NAME(matched_inst, lhs, rhs), \ - NAME(matched_inst, rhs, lhs))) { \ - return AnyOf(NAME(matched_inst, lhs, rhs), \ - NAME(matched_inst, rhs, lhs)); \ + ->decltype(Op(matched_inst) \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithBinaryOperandsAnyOrder(std::forward(lhs), \ + std::forward(rhs))) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithBinaryOperandsAnyOrder(std::forward(lhs), \ + std::forward(rhs)); \ + } \ + template \ + inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs) \ + ->decltype(NAME##AnyOrder( \ + nullptr, std::forward(lhs), std::forward(rhs))) { \ + return NAME##AnyOrder( \ + nullptr, std::forward(lhs), std::forward(rhs)); \ } XLA_COMMUTATIVE_BINOP_PATTERN(Add) XLA_BINOP_PATTERN(Atan2) XLA_BINOP_PATTERN(Divide) XLA_BINOP_PATTERN(Complex) +XLA_BINOP_PATTERN(Convolution) XLA_BINOP_PATTERN(Dot) +XLA_BINOP_PATTERN(DynamicSlice) XLA_COMMUTATIVE_BINOP_PATTERN(Eq) XLA_BINOP_PATTERN(Gather) XLA_BINOP_PATTERN(Ge) @@ -1155,7 +2130,9 @@ XLA_COMMUTATIVE_BINOP_PATTERN(Minimum) XLA_COMMUTATIVE_BINOP_PATTERN(Multiply) XLA_COMMUTATIVE_BINOP_PATTERN(Ne) XLA_BINOP_PATTERN(Outfeed) +XLA_BINOP_PATTERN(Pad) XLA_BINOP_PATTERN(Power) +XLA_BINOP_PATTERN(ReduceWindow) XLA_BINOP_PATTERN(Remainder) XLA_BINOP_PATTERN(Send) XLA_BINOP_PATTERN(Subtract) @@ -1202,6 +2179,7 @@ XLA_BINOP_PATTERN(ShiftRightLogical) .WithOperand(2, std::forward(arg2)); \ } XLA_TERNOP_PATTERN(Clamp); +XLA_TERNOP_PATTERN(Scatter); XLA_TERNOP_PATTERN(Select); #undef XLA_TERNOP_PATTERN @@ -1255,31 +2233,10 @@ inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg, // We could implement all ops as "variadic" ops, but it would make the // already-bad compile errors even worse. XLA_VARIADIC_OP_PATTERN(Concatenate); +XLA_VARIADIC_OP_PATTERN(CustomCall); +XLA_VARIADIC_OP_PATTERN(Map) XLA_VARIADIC_OP_PATTERN(Reduce); - -namespace detail { -struct PatternFriend { - template - static auto ConstantScalar(T constant) -> decltype( - Constant() - .WithShape(match::Shape().IsScalar()) - .WithPredicate( - std::declval>())) { - std::function pred = - [constant](const HloInstruction* instr) { - const auto& literal = Cast(instr)->literal(); - auto status_or_const = LiteralUtil::CreateR0(constant).Convert( - literal.shape().element_type()); - return status_or_const.ok() && - literal == status_or_const.ConsumeValueOrDie(); - }; - - return Constant() - .WithShape(match::Shape().IsScalar()) - .WithPredicate(std::move(pred)); - } -}; -} // namespace detail +XLA_VARIADIC_OP_PATTERN(Tuple); // Helpers for matching non-constant instructions. 
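The rewrite of `XLA_COMMUTATIVE_BINOP_PATTERN` above changes what `NAME##AnyOrder` returns: instead of an `AnyOf` over the two operand orders, it is now an ordinary `Op()` pattern built on `WithBinaryOperandsAnyOrder`, so further constraints can be chained onto it and the failure explanation covers both operand orders. A sketch mirroring the MultiplyAnyOrder test below, with a hypothetical `root` that multiplies constants 42 and 52 in either order:

```
namespace m = xla::match;

const HloInstruction* instr = nullptr;
bool ok = Match(root, m::MultiplyAnyOrder(&instr, m::ConstantScalar(42),
                                          m::ConstantScalar(52)));

// Chaining works because the result exposes the usual Op() API:
bool ok2 = Match(root, m::MultiplyAnyOrder(m::ConstantScalar(42),
                                           m::ConstantScalar(52))
                           .IsNonConstant());
```

The non-constant helpers announced by the preceding comment follow this aside.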
inline auto NonConstant() -> decltype(Op().IsNonConstant()) { @@ -1318,14 +2275,71 @@ inline auto GetTupleElement(HloInstructionType** matched_inst, Arg&& arg, .WithTupleIndex(tuple_index); } -template -inline auto ConstantScalar(T constant) - -> decltype(detail::PatternFriend::ConstantScalar(constant)) { - return detail::PatternFriend::ConstantScalar(constant); +// Add overloads for Parameter which take an int64 specifying the parameter +// number. +inline auto Parameter(int64 parameter_num) -> decltype( + Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(parameter_num)) { + return Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(parameter_num); +} +template +inline auto Parameter(HloInstructionType** matched_inst, int64 parameter_num) + -> decltype(Op(matched_inst) + .WithOpcode(HloOpcode::kParameter) + .WithParameterNum(parameter_num)) { + return Op(matched_inst) + .WithOpcode(HloOpcode::kParameter) + .WithParameterNum(parameter_num); +} + +inline auto ConstantScalar() -> decltype(Op().IsConstantScalar()) { + return Op().IsConstantScalar(); +} + +template +inline auto ConstantScalar(HloInstructionType** matched_inst) + -> decltype(Op(matched_inst).IsConstantScalar()) { + return Op(matched_inst).IsConstantScalar(); +} + +template +inline auto ConstantScalar(ScalarTy val) + -> decltype(Op().IsConstantScalar(val)) { + return Op().IsConstantScalar(val); +} + +template +inline auto ConstantScalar(HloInstructionType** matched_inst, ScalarTy val) + -> decltype(Op(matched_inst).IsConstantScalar(val)) { + return Op(matched_inst).IsConstantScalar(val); +} + +inline auto ConstantEffectiveScalar() -> decltype(Op().IsConstantScalar()) { + return Op().IsConstantEffectiveScalar(); +} + +template +inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst) + -> decltype(Op(matched_inst).IsConstantScalar()) { + return Op(matched_inst).IsConstantEffectiveScalar(); +} + +template +inline auto ConstantEffectiveScalar(ScalarTy val) + -> decltype(Op().IsConstantEffectiveScalar(val)) { + return Op().IsConstantEffectiveScalar(val); +} + +template +inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst, + ScalarTy val) + -> decltype(Op(matched_inst).IsConstantEffectiveScalar(val)) { + return Op(matched_inst).IsConstantEffectiveScalar(val); } } // namespace match } // namespace xla +#undef EXPLAIN +#pragma pop_macro("EXPLAIN") #endif // TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_ diff --git a/tensorflow/compiler/xla/service/pattern_matcher_gmock.h b/tensorflow/compiler/xla/service/pattern_matcher_gmock.h new file mode 100644 index 00000000000..8fe2d10a11b --- /dev/null +++ b/tensorflow/compiler/xla/service/pattern_matcher_gmock.h @@ -0,0 +1,92 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_GMOCK_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_GMOCK_H_ + +#include +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { + +namespace pattern_matcher_gmock_detail { +template +class GmockMatcher { + public: + explicit GmockMatcher(Pattern p) : pattern_(std::move(p)) {} + + // In service of better error messages, list out the overloads explicitly + // rather than just using a template. gMock's polymorphism plus + // pattern_matcher yields some pretty gnarly stuff. + bool MatchAndExplain(const Layout& l, + ::testing::MatchResultListener* listener) const { + return MatchAndExplainImpl(&l, listener); + } + bool MatchAndExplain(const Layout* l, + ::testing::MatchResultListener* listener) const { + return MatchAndExplainImpl(l, listener); + } + + bool MatchAndExplain(const Shape& s, + ::testing::MatchResultListener* listener) const { + return MatchAndExplainImpl(&s, listener); + } + bool MatchAndExplain(const Shape* s, + ::testing::MatchResultListener* listener) const { + return MatchAndExplainImpl(s, listener); + } + + bool MatchAndExplain(const HloInstruction& instr, + ::testing::MatchResultListener* listener) const { + return MatchAndExplainImpl(&instr, listener); + } + bool MatchAndExplain(const HloInstruction* instr, + ::testing::MatchResultListener* listener) const { + return MatchAndExplainImpl(instr, listener); + } + + void DescribeTo(std::ostream* os) const { pattern_.DescribeTo(os); } + + void DescribeNegationTo(std::ostream* os) const { + *os << "is NOT: "; + DescribeTo(os); + } + + private: + template + bool MatchAndExplainImpl(const T* t, + ::testing::MatchResultListener* listener) const { + MatchOption options{/*.capture=*/true, /*.explain_os=*/listener->stream()}; + return Match(t, pattern_, options); + } + + Pattern pattern_; +}; +} // namespace pattern_matcher_gmock_detail + +template +::testing::PolymorphicMatcher< + pattern_matcher_gmock_detail::GmockMatcher> +GmockMatch(Pattern&& p) { + return ::testing::MakePolymorphicMatcher( + pattern_matcher_gmock_detail::GmockMatcher( + std::forward(p))); +} + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_GMOCK_H_ diff --git a/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc new file mode 100644 index 00000000000..9ca2fb05c1f --- /dev/null +++ b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc @@ -0,0 +1,76 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace { + +namespace m = ::xla::match; +using ::testing::Eq; +using ::testing::Not; + +template +string Describe(const ::testing::Matcher& m) { + std::stringstream ss; + m.DescribeTo(&ss); + return ss.str(); +} + +template +string Explain( + const MatchedTy& val, + const ::testing::Matcher::type>& m) { + ::testing::StringMatchResultListener listener; + EXPECT_THAT(val, ::testing::Not(m)); // For the error message. + EXPECT_FALSE(m.MatchAndExplain(val, &listener)); + return listener.str(); +} + +// This file tests the GmockMatch function. The actual explanation and +// description returned by matchers is tested in pattern_matchers_test. +TEST(PatternMatcherGmock, MatchShape) { + Shape s = ShapeUtil::MakeShape(F32, {10, 100}); + // You can pass const Shape& or a const Shape*. + EXPECT_THAT(s, GmockMatch(m::Shape())); + EXPECT_THAT(&s, Not(GmockMatch(m::Shape().WithElementType(F16)))); + EXPECT_THAT(Describe(GmockMatch(m::Shape().IsArray())), + "a shape that represents an array"); +} + +TEST(PatternMatcherGmock, MatchLayout) { + Layout l = LayoutUtil::MakeLayout({0, 1}); + EXPECT_THAT(l, GmockMatch(m::Layout())); + EXPECT_THAT(&l, Not(GmockMatch(m::Layout().WithSparseFormat()))); + EXPECT_THAT(Describe(GmockMatch(m::Layout().WithSparseFormat())), + "a layout with format SPARSE"); +} + +TEST(PatternMatchGmock, MatchInstruction) { + auto instr = + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {42}), "p"); + EXPECT_THAT(instr.get(), GmockMatch(m::Parameter())); + EXPECT_THAT(*instr, GmockMatch(m::Parameter(0))); + EXPECT_THAT(*instr, Not(GmockMatch(m::Parameter(1)))); + EXPECT_THAT(Describe(GmockMatch(m::Parameter())), + "an HloInstruction with opcode parameter"); +} + +} // anonymous namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc index 3f74273517a..186ef0c7911 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc +++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc @@ -14,14 +14,18 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/core/platform/test.h" namespace xla { namespace { +namespace m = match; + TEST(PatternMatcherTest, AddOp) { constexpr char kModuleStr[] = R"(HloModule two_plus_two_module ENTRY %two_plus_two_computation () -> f32[] { @@ -229,23 +233,74 @@ TEST(PatternMatcherTest, AnyOf) { } TEST(PatternMatcherTest, ConstantScalar) { + using match::ConstantEffectiveScalar; + using match::ConstantScalar; + using match::Op; + using match::Tuple; + constexpr char kModuleStr[] = R"( - HloModule test_module ENTRY test { ROOT constant = f16[] constant(42) })"; + HloModule test_module + ENTRY test { + a = s32[] constant(1) + b = s32[1,1] constant(s32[1,1]{{2}}) + c = s32[1,2] constant(s32[1,2]{{2,2}}) + d = f32[] constant(1) + e = f32[] constant(1.25) + ROOT tuple = (s32[], s32[1,1], s32[1,2], f32[], f32[]) tuple(a,b,c,d,e) + })"; TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr)); auto* root = hlo_module->entry_computation()->root_instruction(); - EXPECT_TRUE(Match(root, match::ConstantScalar(42))); - EXPECT_FALSE(Match(root, match::ConstantScalar(41))); - EXPECT_FALSE(Match(root, match::ConstantScalar(0))); -} + const HloInstruction* a = root->operand(0); + const HloInstruction* b = root->operand(1); + const HloInstruction* c = root->operand(2); + const HloInstruction* d = root->operand(3); + const HloInstruction* e = root->operand(4); + EXPECT_TRUE(Match(a, ConstantScalar())); + EXPECT_TRUE(Match(a, ConstantScalar(1))); + EXPECT_TRUE(Match(a, ConstantEffectiveScalar())); + EXPECT_TRUE(Match(a, ConstantEffectiveScalar(1))); + EXPECT_FALSE(Match(a, ConstantScalar(2))); + EXPECT_FALSE(Match(a, ConstantScalar(2.01))); + EXPECT_FALSE(Match(a, ConstantEffectiveScalar(2))); + EXPECT_FALSE(Match(a, ConstantEffectiveScalar(1.01))); -TEST(PatternMatcherTest, NoMatchConstantScalar) { - constexpr char kModuleStr[] = R"( - HloModule test_module ENTRY test { ROOT v = f16[] parameter(0) })"; - TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr)); - auto* root = hlo_module->entry_computation()->root_instruction(); + EXPECT_FALSE(Match(b, ConstantScalar())); + EXPECT_FALSE(Match(b, ConstantScalar(2))); + EXPECT_TRUE(Match(b, ConstantEffectiveScalar())); + EXPECT_TRUE(Match(b, ConstantEffectiveScalar(2))); - EXPECT_FALSE(Match(root, match::ConstantScalar(42))); + EXPECT_FALSE(Match(c, ConstantScalar())); + EXPECT_FALSE(Match(c, ConstantScalar(2))); + EXPECT_FALSE(Match(c, ConstantEffectiveScalar())); + EXPECT_FALSE(Match(c, ConstantEffectiveScalar(2))); + + EXPECT_TRUE(Match(d, ConstantScalar(1))); + EXPECT_TRUE(Match(d, ConstantEffectiveScalar(1))); + EXPECT_TRUE(Match(d, ConstantScalar(1.0))); + EXPECT_TRUE(Match(d, ConstantEffectiveScalar(1.0))); + + EXPECT_TRUE(Match(e, ConstantScalar(1.25f))); + EXPECT_TRUE(Match(e, ConstantScalar(1.25))); + EXPECT_TRUE(Match(e, ConstantEffectiveScalar(1.25))); + EXPECT_FALSE(Match(e, ConstantScalar(1))); + EXPECT_FALSE(Match(e, ConstantEffectiveScalar(1))); + + const HloInstruction* instr = nullptr; + EXPECT_TRUE(Match(a, ConstantScalar(&instr))); + EXPECT_EQ(instr, a); + + instr = nullptr; + EXPECT_TRUE(Match(a, ConstantScalar(&instr, 1))); + 
EXPECT_EQ(instr, a); + + instr = nullptr; + EXPECT_TRUE(Match(a, ConstantEffectiveScalar(&instr))); + EXPECT_EQ(instr, a); + + instr = nullptr; + EXPECT_TRUE(Match(a, ConstantEffectiveScalar(&instr, 1))); + EXPECT_EQ(instr, a); } TEST(PatternMatcherTest, MultiplyAnyOrder) { @@ -267,6 +322,15 @@ TEST(PatternMatcherTest, MultiplyAnyOrder) { root, MultiplyAnyOrder(&instr, ConstantScalar(42), ConstantScalar(52)))); EXPECT_TRUE(Match( root, MultiplyAnyOrder(&instr, ConstantScalar(52), ConstantScalar(42)))); + + // Check that MultiplyAnyOrder exposes the same API as Op(), so we can call + // e.g. IsNonConstant() on it. + EXPECT_TRUE(Match( + root, MultiplyAnyOrder(&instr, ConstantScalar(42), ConstantScalar(52)) + .IsNonConstant())); + EXPECT_TRUE( + Match(root, MultiplyAnyOrder(ConstantScalar(42), ConstantScalar(52)) + .IsNonConstant())); } TEST(PatternMatcherTest, AnyOfShortCircuit) { @@ -315,14 +379,22 @@ TEST(PatternMatcherTest, AllOf) { TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr)); auto* root = hlo_module->entry_computation()->root_instruction(); + auto f16_scalar = ShapeUtil::MakeShape(F16, {}); + auto f16_pattern = Constant().WithShapeEqualTo(&f16_scalar); + auto f16_compatible_pattern = Constant().WithShapeCompatibleTo(&f16_scalar); auto scalar_pattern = Constant().WithShape(match::Shape().IsScalar()); - auto f16_pattern = Constant().WithShape(match::Shape().WithElementType(F16)); ASSERT_TRUE(Match(root, scalar_pattern)); ASSERT_TRUE(Match(root, f16_pattern)); - EXPECT_TRUE(Match(root, AllOf(scalar_pattern, f16_pattern))); - EXPECT_TRUE(Match(root, AllOf(f16_pattern, scalar_pattern))); + ASSERT_TRUE(Match(root, f16_compatible_pattern)); + EXPECT_TRUE(Match(root, AllOf(scalar_pattern, f16_pattern, + f16_compatible_pattern))); + EXPECT_TRUE( + Match(root, AllOf(f16_pattern, f16_compatible_pattern, + scalar_pattern))); EXPECT_FALSE( Match(root, AllOf(Broadcast(Op()), f16_pattern))); + EXPECT_FALSE(Match( + root, AllOf(Broadcast(Op()), f16_compatible_pattern))); EXPECT_FALSE( Match(root, AllOf(Broadcast(Op()), scalar_pattern))); } @@ -431,5 +503,433 @@ TEST(PatternMatcherTest, TestConcat) { Reshape(ConstantScalar(4))))); } +template +string Description(const Pattern& pattern) { + std::stringstream ss; + pattern.DescribeTo(&ss); + return ss.str(); +} + +template +string Explanation(Elem* elem, const Pattern& pattern) { + std::stringstream ss; + MatchOption options{/*.capture=*/true, /*.explain_os=*/&ss}; + Match(elem, pattern, options); + return ss.str(); +} +template +string Explanation(const std::unique_ptr& elem, const Pattern& pattern) { + return Explanation(elem.get(), pattern); +} +template +string Explanation(const Elem& elem, const Pattern& pattern) { + return Explanation(&elem, pattern); +} + +// Helper macro for checking a pattern's description and the explanation printed +// when attempting to match (and presumably failing) on a given object. +// +// We use a macro rather than a function because we want good line numbers in +// errors. We use this rather than writing a helper that returns a pair of +// (description, explanation) and doing something like +// +// EXPECT_THAT(DescAndExplanation(...), ::testing::Pair(..., ...)); +// +// because EXPECT_EQ prints a unified diff if multiline string comparison fails, +// while EXPECT_THAT does not. This unified diff makes the errors much easier +// to read. 
+#define EXPECT_DESC_AND_EXPLANATION(elem, pattern, expected_desc, \ + expected_explanation) \ + do { \ + EXPECT_EQ(Description(pattern), (expected_desc)); \ + EXPECT_EQ(Explanation((elem), (pattern)), expected_explanation); \ + } while (0) + +TEST(PatternMatcherTest, LayoutDescribeToAndExplain) { + auto layout = LayoutUtil::MakeLayout({1, 2}); + auto layout2 = LayoutUtil::MakeLayout({2, 2}); + + EXPECT_DESC_AND_EXPLANATION(static_cast(nullptr), m::Layout(), + "a layout", "Layout is null"); + EXPECT_DESC_AND_EXPLANATION(layout2, m::Layout().EqualTo(&layout), + "a layout equal to {1,2}", + "Layout {2,2} is not equal to expected {1,2}"); + EXPECT_DESC_AND_EXPLANATION(layout2, m::Layout().WithSparseFormat(), + "a layout with format SPARSE", + "Layout has format DENSE but expected SPARSE"); + EXPECT_DESC_AND_EXPLANATION(layout, + m::Layout().EqualTo(&layout).WithSparseFormat(), + "a layout:\n" + " * equal to {1,2} AND\n" + " * with format SPARSE", + "Layout has format DENSE but expected SPARSE"); +} + +TEST(PatternMatcherTest, ShapeDescribeToAndExplain) { + auto shape = ShapeUtil::MakeShapeWithLayout(F32, {1, 2}, {0, 1}); + auto layout = shape.layout(); + + EXPECT_DESC_AND_EXPLANATION(static_cast(nullptr), m::Shape(), + "a shape", "Shape is null"); + EXPECT_DESC_AND_EXPLANATION( + ShapeUtil::MakeShapeWithLayout(F32, {1, 2}, {1, 0}), + m::Shape().EqualTo(&shape), "a shape equal to f32[1,2]{0,1}", + "Shape not equal to f32[1,2]{0,1}\n" + "in f32[1,2]{1,0}"); + EXPECT_DESC_AND_EXPLANATION(ShapeUtil::MakeShape(F32, {2, 2}), + m::Shape().CompatibleTo(&shape), + "a shape compatible with f32[1,2]", + "Shape not compatible with f32[1,2]\n" + "in f32[2,2]{1,0}"); + EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithElementType(F16), + "a shape with element type F16", + "Shape does not have element type F16\n" + "in f32[1,2]{0,1}"); + EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().IsScalar(), + "a shape that represents a scalar", + "Shape is not a scalar\n" + "in f32[1,2]{0,1}"); + EXPECT_DESC_AND_EXPLANATION(ShapeUtil::MakeNil(), m::Shape().IsArray(), + "a shape that represents an array", + "Shape is not an array\n" + "in ()"); + EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().IsTuple(), + "a shape that represents a tuple", + "Shape is not a tuple\n" + "in f32[1,2]{0,1}"); + EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().IsEffectiveScalar(), + "a shape that is an effective scalar", + "Shape is not an effective scalar\n" + "in f32[1,2]{0,1}"); + EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithRank(42), + "a shape that has 42 dimensions", + "Shape does not have rank 42\n" + "in f32[1,2]{0,1}"); + EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithRank(0), + "a shape that is a scalar", + "Shape is not a scalar\n" + "in f32[1,2]{0,1}"); + EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithRank(1).IsArray(), + "a shape:\n" + " * that has 1 dimension AND\n" + " * that represents an array", + "Shape does not have rank 1\n" + "in f32[1,2]{0,1}"); + EXPECT_DESC_AND_EXPLANATION(ShapeUtil::MakeNil(), + m::Shape().IsArray().WithRank(1), + "a shape:\n" + " * that represents an array AND\n" + " * that has 1 dimension", + "Shape is not an array\n" + "in ()"); + EXPECT_DESC_AND_EXPLANATION( + ShapeUtil::MakeShapeWithLayout(F32, {1, 2}, {1, 0}), + m::Shape().WithLayoutEqualTo(&layout), + "a shape with\n a layout equal to {0,1}", + "Layout {1,0} is not equal to expected {0,1}\n" + "in f32[1,2]{1,0}"); + EXPECT_DESC_AND_EXPLANATION( + shape, m::Shape().WithLayout(m::Layout().WithSparseFormat()), + "a shape with\n a layout 
with format SPARSE", + "Layout has format DENSE but expected SPARSE\n" + "in f32[1,2]{0,1}"); + EXPECT_DESC_AND_EXPLANATION(shape, + m::Shape().WithSubshapeEqualTo({10}, &shape), + "a shape with subshape at index {10} which is\n" + " a shape equal to f32[1,2]{0,1}", + "No subshape at {10}\n" + "in f32[1,2]{0,1}"); + EXPECT_DESC_AND_EXPLANATION( + ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {2, 2})}), + m::Shape().WithSubshapeEqualTo({0}, &shape), + "a shape with subshape at index {0} which is\n" + " a shape equal to f32[1,2]{0,1}", + "Shape not equal to f32[1,2]{0,1}\n" + "in f32[2,2]{1,0}\n" + "in subshape at {0}\n" + "in (f32[2,2])"); + EXPECT_DESC_AND_EXPLANATION(shape, + m::Shape().WithSubshapeCompatibleTo({10}, &shape), + "a shape with subshape at index {10} which is\n" + " a shape compatible with f32[1,2]", + "No subshape at {10}\n" + "in f32[1,2]{0,1}"); + EXPECT_DESC_AND_EXPLANATION( + ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {2, 2})}), + m::Shape().WithSubshapeCompatibleTo({0}, &shape), + "a shape with subshape at index {0} which is\n" + " a shape compatible with f32[1,2]", + "Shape not compatible with f32[1,2]\n" + "in f32[2,2]{1,0}\n" + "in subshape at {0}\n" + "in (f32[2,2])"); + EXPECT_DESC_AND_EXPLANATION( + ShapeUtil::MakeTupleShape({ShapeUtil::MakeTupleShape({shape})}), + m::Shape().WithSubshape({0, 0}, m::Shape().IsScalar()), + "a shape with subshape at index {0,0} which is\n" + " a shape that represents a scalar", + "Shape is not a scalar\n" + "in f32[1,2]{0,1}\n" + "in subshape at {0,0}\n" + "in ((f32[1,2]))"); +} + +std::unique_ptr SetName(absl::string_view name, + std::unique_ptr instr) { + instr->SetAndSanitizeName(string(name)); + return instr; +} + +TEST(PatternMatcherTest, HloInstructionDescribeToAndExplain) { + std::unique_ptr iota = + SetName("i", HloInstruction::CreateIota(ShapeUtil::MakeShape(S32, {42}), + /*iota_dimension=*/0)); + std::unique_ptr constant = + SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + + EXPECT_DESC_AND_EXPLANATION(static_cast(nullptr), + m::Op(), "an HloInstruction", + "HloInstruction* is null"); + EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithName("foo"), + "an HloInstruction named \"foo\"", + "HloInstruction not named \"foo\"\n" + "in i = s32[42]{0} iota(), iota_dimension=0"); + EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithOpcode(HloOpcode::kAdd), + "an HloInstruction with opcode add", + "HloInstruction doesn't have opcode add\n" + "in i = s32[42]{0} iota(), iota_dimension=0"); + EXPECT_DESC_AND_EXPLANATION( + constant, m::Op().IsNonConstant(), + "an HloInstruction with any opcode other than constant", + "HloInstruction has opcode constant, expected anything else\n" + "in c = s32[] constant(0)"); + EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithNumOperands(42), + "an HloInstruction with 42 operands", + "HloInstruction doesn't have 42 operands\n" + "in i = s32[42]{0} iota(), iota_dimension=0"); + EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithShape(m::Shape().IsTuple()), + "an HloInstruction outputting\n" + " a shape that represents a tuple", + "Shape is not a tuple\n" + "in s32[42]{0}\n" + "in output shape\n" + "in i = s32[42]{0} iota(), iota_dimension=0"); + EXPECT_DESC_AND_EXPLANATION( + iota, m::Op().WithOperand(2, m::Op().WithOpcode(HloOpcode::kAdd)), + "an HloInstruction with operand 2 which is:\n" + " an HloInstruction with opcode add", + "desired operand index 2 is out of bounds\n" + "in i = s32[42]{0} iota(), iota_dimension=0"); + + EXPECT_DESC_AND_EXPLANATION( + SetName("a", 
HloInstruction::CreateBinary(ShapeUtil::MakeShape(S32, {}), + HloOpcode::kAdd, constant.get(), + constant.get())), + m::Op().WithOperand(1, m::Op().IsNonConstant()), + "an HloInstruction with operand 1 which is:\n" + " an HloInstruction with any opcode other than constant", + "HloInstruction has opcode constant, expected anything else\n" + "in c = s32[] constant(0)\n" + "in operand 1\n" + "in a = s32[] add(s32[] c, s32[] c)"); + EXPECT_DESC_AND_EXPLANATION( + iota, m::Op().WithFusionKind(HloInstruction::FusionKind::kLoop), + "an HloInstruction with fusion kind kLoop", + "HloInstruction does not have fusion kind kLoop; it's not a fusion\n" + "in i = s32[42]{0} iota(), iota_dimension=0"); + EXPECT_DESC_AND_EXPLANATION( + iota, m::Op().WithTupleIndex(42), + "an HloInstruction which is a GTE with index 42", + "HloInstruction is not a GTE with index 42; it's not a GTE at all\n" + "in i = s32[42]{0} iota(), iota_dimension=0"); + EXPECT_DESC_AND_EXPLANATION(iota, m::Op().IsConstantScalar(), + "an HloInstruction which is a constant scalar", + "HloInstruction is not a constant\n" + "in i = s32[42]{0} iota(), iota_dimension=0"); + EXPECT_DESC_AND_EXPLANATION( + SetName("c", HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1, 2}))), + m::Op().IsConstantEffectiveScalar(), + "an HloInstruction which is a constant effective scalar", + "HloInstruction is not an effective scalar\n" + "in c = s32[2]{0} constant({1, 2})"); + EXPECT_DESC_AND_EXPLANATION( + SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(10))), + m::Op().IsConstantScalar(42), + "an HloInstruction which is a constant scalar with value 42", + "HloInstruction's constant value 10 did not match expected value 42\n" + "in c = s32[] constant(10)"); + EXPECT_DESC_AND_EXPLANATION( + SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.25))), + m::Op().IsConstantEffectiveScalar(1.25), + "an HloInstruction which is a constant effective scalar with value 1.25", + "HloInstruction's constant value 2.25 did not match expected value 1.25\n" + "in c = f64[] constant(2.25)"); + EXPECT_DESC_AND_EXPLANATION( + constant, m::Op().Is(iota.get()), + absl::StrCat("an HloInstruction which is 0x", absl::Hex(iota.get()), + " (i = s32[42]{0} iota(), iota_dimension=0)"), + absl::StrCat("HloInstruction 0x", absl::Hex(constant.get()), " is not 0x", + absl::Hex(iota.get()), + " (i = s32[42]{0} iota(), iota_dimension=0)\n" + "in c = s32[] constant(0)")); +} + +TEST(PatternMatcherTest, HloInstructionMatcherAnyOrderDescribeTo) { + auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); + EXPECT_DESC_AND_EXPLANATION( + SetName("a", HloInstruction::CreateBinary( + scalar_s32, HloOpcode::kAdd, + SetName("b", HloInstruction::CreateConstant( + LiteralUtil::CreateR0(0))) + .get(), + SetName("c", HloInstruction::CreateConstant( + LiteralUtil::CreateR0(0))) + .get())), + m::AddAnyOrder(m::Op().WithName("b"), m::Op().WithName("bar")), + "an HloInstruction:\n" + " * with opcode add AND\n" + " * with two operands in either order:\n" + " - an HloInstruction named \"b\"\n" + " - an HloInstruction named \"bar\"", + "HloInstruction's operands (ignoring order) did not match second " + "matcher. 
Specifically,\n" + " - an HloInstruction named \"bar\"\n" + "does not match LHS:\n" + " - HloInstruction not named \"bar\"\n" + " in b = s32[] constant(0)\n" + "does not match RHS:\n" + " - HloInstruction not named \"bar\"\n" + " in c = s32[] constant(0)\n" + "in a = s32[] add(s32[] b, s32[] c)"); + + EXPECT_DESC_AND_EXPLANATION( + SetName("a", + HloInstruction::CreateBinary( + scalar_s32, HloOpcode::kAdd, + HloInstruction::CreateParameter(0, scalar_s32, "p").get(), + SetName("c", HloInstruction::CreateConstant( + LiteralUtil::CreateR0(0))) + .get())), + m::AddAnyOrder(m::Op().IsConstantScalar(), m::Op().IsConstant()), + "an HloInstruction:\n" + " * with opcode add AND\n" + " * with two operands in either order:\n" + " - an HloInstruction which is a constant scalar\n" + " - an HloInstruction with opcode constant", + "HloInstruction's LHS operand did not match either of the two matchers. " + "Specifically,\n" + " - an HloInstruction which is a constant scalar\n" + "does not match LHS:\n" + " - HloInstruction is not a constant\n" + " in p = s32[] parameter(0)\n" + "and\n" + " - an HloInstruction with opcode constant\n" + "does not match LHS:\n" + " - HloInstruction doesn't have opcode constant\n" + " in p = s32[] parameter(0)\n" + "in a = s32[] add(s32[] p, s32[] c)"); +} + +TEST(PatternMatcherTest, AnyOfMatcherDescribeToAndExplain) { + EXPECT_DESC_AND_EXPLANATION( + SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))), + m::AnyOf(m::Op().WithName("foo"), + m::Op().WithName("bar")), + "any of:\n" + " - an HloInstruction named \"foo\" OR\n" + " - an HloInstruction named \"bar\"", + "None of the following matchers succeeded:\n" + "Matcher #1\n" + " - an HloInstruction named \"foo\"\n" + "failed with\n" + " - HloInstruction not named \"foo\"\n" + " in c = s32[] constant(0)\n" + "Matcher #2\n" + " - an HloInstruction named \"bar\"\n" + "failed with\n" + " - HloInstruction not named \"bar\"\n" + " in c = s32[] constant(0)"); +} + +TEST(PatternMatcherTest, Parameter) { + auto param = + HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "p1"); + auto non_param = + SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + EXPECT_FALSE(Match(param.get(), m::Parameter(0))); + EXPECT_TRUE(Match(param.get(), m::Parameter())); + EXPECT_TRUE(Match(param.get(), m::Parameter(1))); + EXPECT_FALSE(Match(non_param.get(), m::Parameter())); + EXPECT_FALSE(Match(non_param.get(), m::Parameter(1))); + + EXPECT_DESC_AND_EXPLANATION(non_param, m::Parameter(1), + "an HloInstruction:\n" + " * with opcode parameter AND\n" + " * which is parameter 1", + "HloInstruction doesn't have opcode parameter\n" + "in c = s32[] constant(0)"); + EXPECT_EQ(Explanation(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "p0"), + m::Parameter(1)), + "HloInstruction is not parameter 1\n" + "in p0 = f32[] parameter(0)"); +} + +TEST(PatternMatcherTest, OneUseAndOneUser) { + auto param = + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"); + + EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse())); + EXPECT_DESC_AND_EXPLANATION( + param, m::Op().WithOneUse(), + "an HloInstruction which has exactly one use", + "HloInstruction has 0 users, but expected exactly one.\n" + "in p0 = f32[] parameter(0)"); + + EXPECT_FALSE(Match(param.get(), m::Op().WithOneUser())); + EXPECT_DESC_AND_EXPLANATION( + param, m::Op().WithOneUser(), + "an HloInstruction which has exactly one user (but possibly is used " + "multiple times by that instruction)", + "HloInstruction has 
0 users, but expected exactly one.\n" + "in p0 = f32[] parameter(0)"); + + { + auto reshape = + SetName("r", HloInstruction::CreateReshape( + ShapeUtil::MakeShape(F32, {1}), param.get())); + EXPECT_TRUE(Match(param.get(), m::Op().WithOneUse())); + EXPECT_TRUE(Match(param.get(), m::Op().WithOneUser())); + + auto reshape1 = + SetName("r1", HloInstruction::CreateReshape( + ShapeUtil::MakeShape(F32, {1}), param.get())); + EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse())); + EXPECT_FALSE(Match(param.get(), m::Op().WithOneUser())); + + const char* kMultipleUserExplanation = + "HloInstruction has 2 users, but expected exactly one.\n" + "All users:\n" + " - r = f32[1]{0} reshape(f32[] p0)\n" + " - r1 = f32[1]{0} reshape(f32[] p0)\n" + "in p0 = f32[] parameter(0)"; + EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()), + kMultipleUserExplanation); + EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUser()), + kMultipleUserExplanation); + } + + auto add = SetName("add", HloInstruction::CreateBinary( + ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, + param.get(), param.get())); + EXPECT_TRUE(Match(param.get(), m::Op().WithOneUser())); + EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse())); + EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()), + "HloInstruction is used 2 times by its user, but is expected to be " + "used just once: add = f32[] add(f32[] p0, f32[] p0)\n" + "in p0 = f32[] parameter(0)"); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc index 16fa80d53e7..efeec965714 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc @@ -54,7 +54,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeUnaryInstruction) { HloInstruction* b = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, a)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -81,7 +81,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeUnaryScalarInstruction) { HloInstruction* b = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, a)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -111,7 +111,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeBinaryInstruction) { HloInstruction* c = builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -140,7 +140,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeZeroInputInstruction) { HloInstruction* b = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, a)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. 
@@ -173,7 +173,7 @@ TEST_F(ReducePrecisionInsertionTest, AvoidAddingDuplicateInstructions) { HloInstruction* d = builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, b, c)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -205,7 +205,7 @@ TEST_F(ReducePrecisionInsertionTest, AfterRootInstruction) { HloInstruction* b = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, a)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -242,7 +242,7 @@ TEST_F(ReducePrecisionInsertionTest, AfterNonRootInstruction) { HloInstruction* c = builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a_cos, b_cos)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); // Confirm expected graph before adding ops. @@ -295,7 +295,7 @@ TEST_F(ReducePrecisionInsertionTest, ShouldReduceOutputPrecisionIsFalse) { HloInstruction* y = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, x)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected graph before adding ops. @@ -321,7 +321,7 @@ TEST_F(ReducePrecisionInsertionTest, InsertionIsNotRecursive) { HloInstruction* b = builder.AddInstruction( HloInstruction::CreateReducePrecision(shape, a, 8, 23)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected state before adding ops. @@ -348,7 +348,7 @@ TEST_F(ReducePrecisionInsertionTest, SkipRedundantReducePrecisionAfter) { HloInstruction* y = builder.AddInstruction( HloInstruction::CreateReducePrecision(shape, x, 5, 10)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected graph before adding ops. @@ -376,7 +376,7 @@ TEST_F(ReducePrecisionInsertionTest, AddNonRedundantReducePrecision) { HloInstruction* y = builder.AddInstruction( HloInstruction::CreateReducePrecision(shape, x, 8, 23)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Confirm expected graph before adding ops. @@ -402,7 +402,7 @@ TEST_F(ReducePrecisionInsertionTest, IgnoreOpsInsideFusionNode) { builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); HloInstruction* y = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, x)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Manually fuse the kCos operation into a fusion operation. 
@@ -438,7 +438,7 @@ TEST_F(ReducePrecisionInsertionTest, OpGetsInsertedInHeadOfFusionNode) { builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); HloInstruction* y = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, x)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Manually fuse the kCos operation into a fusion operation. @@ -485,7 +485,7 @@ TEST_F(ReducePrecisionInsertionTest, OpGetsInsertedInTailOfFusionNode) { builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); HloInstruction* y = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCos, x)); - auto module = CreateNewUnverifiedModule(); + auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); // Manually fuse the kCos operation into a fusion operation. diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 75f7413b3c3..5ec7fe2aded 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -41,6 +41,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/source_map_util.h" #include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -275,8 +276,8 @@ StatusOr> Service::CreateModuleConfig( } if (execution_options != nullptr && execution_options->has_shape_with_output_layout()) { - const auto& shape_with_output_layout = - execution_options->shape_with_output_layout(); + const Shape shape_with_output_layout( + execution_options->shape_with_output_layout()); TF_RETURN_IF_ERROR( ValidateResultShape(shape_with_output_layout, program_shape.result())); TF_RETURN_IF_ERROR( @@ -658,9 +659,9 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, // replica 0. 
TF_ASSIGN_OR_RETURN( std::unique_ptr module_config, - CreateModuleConfig(request.computation().host_program_shape(), - replicated_arguments.front(), - request.execution_options())); + CreateModuleConfig( + ProgramShape{request.computation().host_program_shape()}, + replicated_arguments.front(), request.execution_options())); VLOG(3) << "ExecuteGraphParallel created HloModuleConfig computation layout: " << module_config->entry_computation_layout().ToString(); @@ -745,9 +746,9 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg, } if (available_device_count < arg->device_count() * replica_count) { return ResourceExhausted( - "Requested device count (%d) exceeds the number of available devices " - "on the target (%d)", - arg->device_count(), available_device_count); + "Requested logical device count (%d) with replica count (%d) exceeds " + "the number of available physical devices on the target (%d)", + arg->device_count(), replica_count, available_device_count); } for (int64 i = 0; i < arg->device_count(); ++i) { @@ -818,14 +819,17 @@ Status Service::Compile(const CompileRequest* arg, CompileResponse* result) { "The compile request does not support multiple device handles."); } - std::vector argument_shapes; - absl::c_transform(arg->input_shape_with_layout(), - std::back_inserter(argument_shapes), - [](const Shape& shape) { return &shape; }); + std::vector argument_shapes; + argument_shapes.reserve(arg->input_shape_with_layout_size()); + std::vector argument_shape_ptrs; + for (const ShapeProto& shape_proto : arg->input_shape_with_layout()) { + argument_shapes.push_back(Shape(shape_proto)); + argument_shape_ptrs.push_back(&argument_shapes.back()); + } TF_ASSIGN_OR_RETURN( std::unique_ptr module_config, - CreateModuleConfig(arg->computation().host_program_shape(), - argument_shapes, &arg->execution_options())); + CreateModuleConfig(ProgramShape{arg->computation().host_program_shape()}, + argument_shape_ptrs, &arg->execution_options())); VLOG(3) << "Compile created HloModuleConfig computation layout: " << module_config->entry_computation_layout().ToString(); @@ -930,14 +934,14 @@ Status Service::TransferToClient(const TransferToClientRequest* arg, TF_ASSIGN_OR_RETURN(const ShapedBuffer* shaped_buffer, allocation_tracker_.ResolveForReplica(arg->data(), 0)); - const Shape* return_shape; + Shape return_shape; if (arg->has_shape_with_layout()) { - if (!LayoutUtil::HasLayout(arg->shape_with_layout())) { + return_shape = Shape(arg->shape_with_layout()); + if (!LayoutUtil::HasLayout(return_shape)) { return InvalidArgument("shape_with_layout must have layout if present."); } - return_shape = &arg->shape_with_layout(); } else { - return_shape = &shaped_buffer->on_host_shape(); + return_shape = Shape(shaped_buffer->on_host_shape()); } TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( @@ -948,30 +952,15 @@ Status Service::TransferToClient(const TransferToClientRequest* arg, execute_backend_->transfer_manager()->TransferLiteralFromDevice( stream.get(), *shaped_buffer)); - if (LayoutUtil::LayoutsInShapesEqual(*return_shape, result_literal.shape())) { + if (LayoutUtil::LayoutsInShapesEqual(return_shape, result_literal.shape())) { *result->mutable_literal() = result_literal.ToProto(); } else { *result->mutable_literal() = - result_literal.Relayout(*return_shape).ToProto(); + result_literal.Relayout(return_shape).ToProto(); } return Status::OK(); } -namespace { - -// Creates a clone of the given shaped buffer with the given device ordinal. 
The -// shape and DeviceMemoryBase values of the clone are identical to the original. -std::unique_ptr CloneShapedBufferOnDevice( - const ShapedBuffer& shaped_buffer, int device_ordinal) { - auto clone = absl::make_unique( - shaped_buffer.on_host_shape(), shaped_buffer.on_device_shape(), - shaped_buffer.platform(), device_ordinal); - clone->buffers() = shaped_buffer.buffers(); - return clone; -} - -} // namespace - Status Service::TransferToServer(const TransferToServerRequest* arg, TransferToServerResponse* result) { TF_ASSIGN_OR_RETURN(Literal literal, @@ -1060,11 +1049,11 @@ Status Service::TransferFromOutfeed(const TransferFromOutfeedRequest* arg, executor = replicas[arg->replica_id()]; } - auto literal = Literal::CreateFromShape(arg->shape_with_layout()); + auto literal = Literal::CreateFromShape(Shape(arg->shape_with_layout())); TF_RETURN_IF_ERROR( execute_backend_->transfer_manager()->TransferLiteralFromOutfeed( - executor, arg->shape_with_layout(), literal)); + executor, Shape(arg->shape_with_layout()), literal)); *result->mutable_literal() = literal.ToProto(); return Status::OK(); } @@ -1087,7 +1076,7 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg, "constant computation may not depend on any parameters."); } - ProgramShape program_shape = arg->computation().host_program_shape(); + ProgramShape program_shape(arg->computation().host_program_shape()); TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result())); if (arg->has_output_layout()) { TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape( @@ -1118,7 +1107,7 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg, Status Service::GetShape(const GetShapeRequest* arg, GetShapeResponse* result) { TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer, allocation_tracker_.ResolveForReplica(arg->data(), 0)); - *result->mutable_shape() = buffer->on_host_shape(); + *result->mutable_shape() = buffer->on_host_shape().ToProto(); return Status::OK(); } @@ -1131,7 +1120,7 @@ Status Service::GetComputationGraphStats( return InvalidArgument("Program shape may not be empty."); } - HloModuleConfig config(arg->computation().host_program_shape()); + HloModuleConfig config(ProgramShape{arg->computation().host_program_shape()}); config.set_debug_options(arg->debug_options()); TF_ASSIGN_OR_RETURN(std::unique_ptr module, CreateModuleFromProto(arg->computation(), config)); diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 61a60ef9efa..7e7282a7370 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -391,17 +391,6 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return ShapeUtil::MakeShape(element_type, new_dimensions); } -/* static */ StatusOr ShapeInference::InferAfterAllShape( - absl::Span arg_shapes) { - for (const Shape* arg_shape : arg_shapes) { - if (arg_shape->element_type() != TOKEN) { - return InvalidArgument( - "Operands of token instructions must be TOKEN types."); - } - } - return ShapeUtil::MakeTokenShape(); -} - /* static */ StatusOr ShapeInference::InferConvertShape( const Shape& operand_shape, PrimitiveType new_element_type) { auto old_element_type = operand_shape.element_type(); @@ -1029,7 +1018,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, switch (opcode) { case HloOpcode::kTuple: { Shape result = ShapeUtil::MakeTupleShape({}); - result.mutable_tuple_shapes()->Reserve(operand_shapes.size()); + 
result.mutable_tuple_shapes()->reserve(operand_shapes.size()); for (const Shape* shape : operand_shapes) { ShapeUtil::AppendShapeToTuple(*shape, &result); } @@ -2038,7 +2027,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, dimension); } - return ShapeUtil::MakeShape(S64, {}); + // TODO(b/119580730): Remove this restriction when very large dimension size + // is needed. + if (shape.dimensions(dimension) > std::numeric_limits::max()) { + return InvalidArgument( + "GetDimensionSize's input shape is %s, the %dth dimension exceeds the " + "UINT_MAX limit.", + ShapeUtil::HumanString(shape), dimension); + } + + return ShapeUtil::MakeShape(U32, {}); } /* static */ StatusOr ShapeInference::InferSliceShape( diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index 31ef4b2e410..d94385a04d5 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -232,13 +232,6 @@ class ShapeInference { static StatusOr InferConcatOpShape( absl::Span arg_shapes, int64 dimension); - // Infers the shape produced by a kAfterAll. Trivially this shape is always a - // TOKEN shape. However, ShapeInference serves two purposes: inferring shapes - // and checking operand shapes. This method verifies that the operand shapes - // are all TOKENs. - static StatusOr InferAfterAllShape( - absl::Span arg_shapes); - // Helper that validates the given operand shape can be converted to the // target output_shape via a convert instruction -- the requirement is that // the shape is identical except for the element type. diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc index 7a565bf0768..17cdaa74fc3 100644 --- a/tensorflow/compiler/xla/service/transpose_folding_test.cc +++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc @@ -172,7 +172,7 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) { HloInstruction* mul = builder.AddInstruction(HloInstruction::CreateBinary( add->shape(), HloOpcode::kMultiply, add, sub)); - auto module = CreateNewUnverifiedModule("fuse_with_constant_operands"); + auto module = CreateNewVerifiedModule("fuse_with_constant_operands"); HloComputation* entry_computation = module->AddEntryComputation(builder.Build(mul)); HloInstruction* call = module->OutlineExpressionFromComputation( @@ -247,7 +247,7 @@ TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) { conv_shape.ValueOrDie(), x, transpose_y, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewUnverifiedModule("test_module"); + auto module = CreateNewVerifiedModule("test_module"); HloComputation* entry_computation = module->AddEntryComputation(builder.Build(conv)); FoldTranspose(module.get()); @@ -302,7 +302,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) { conv_shape.ValueOrDie(), x, transpose_y, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewUnverifiedModule("test_module"); + auto module = CreateNewVerifiedModule("test_module"); HloComputation* entry_computation = module->AddEntryComputation(builder.Build(conv)); FoldTranspose(module.get()); @@ -362,7 +362,7 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) { conv_shape.ValueOrDie(), transpose_x, y, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewUnverifiedModule("test_module"); + auto module = 
CreateNewVerifiedModule("test_module"); HloComputation* entry_computation = module->AddEntryComputation(builder.Build(conv)); FoldTranspose(module.get()); @@ -428,7 +428,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) { conv_shape.ValueOrDie(), transpose_x, y, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); - auto module = CreateNewUnverifiedModule("test_module"); + auto module = CreateNewVerifiedModule("test_module"); HloComputation* entry_computation = module->AddEntryComputation(builder.Build(conv)); FoldTranspose(module.get()); diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc index 96f3055c98e..50d51eaeb76 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -280,6 +280,13 @@ Status TuplePointsToAnalysis::HandleDomain(HloInstruction* domain) { return Status::OK(); } +Status TuplePointsToAnalysis::HandleAddDependency( + HloInstruction* add_dependency) { + // AddDependency just forwards the value of its zero-th operand. + CreateCopiedPointsToSet(add_dependency, add_dependency->operand(0)); + return Status::OK(); +} + Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) { // RecvDone aliases its input (Recv) tuple element {0} to element {0} of its // output. The other indices ({} and {1}) define their own buffers. diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h index bcfcb388f95..0a1d5649d6d 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h @@ -252,6 +252,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { Status HandleRecvDone(HloInstruction* recv_done) override; Status HandleSend(HloInstruction* send) override; Status HandleTupleSelect(HloInstruction* tuple_select) override; + Status HandleAddDependency(HloInstruction* add_dependency) override; string ToString() const; diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index 10ef2d38fa2..561762b5d42 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -264,6 +264,22 @@ TEST_F(TuplePointsToAnalysisTest, GetTupleElement) { UnorderedElementsAre(inner_tuple)); } +TEST_F(TuplePointsToAnalysisTest, AddDependency) { + auto builder = HloComputation::Builder(TestName()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); + auto add_dependency = builder.AddInstruction( + HloInstruction::CreateAddDependency(constant, token)); + BuildModuleAndRunAnalysis(builder.Build()); + + auto& points_to_set = points_to_analysis_->GetPointsToSet(add_dependency); + EXPECT_EQ(1, points_to_set.size()); + EXPECT_FALSE(points_to_set.IsAmbiguous()); + EXPECT_TRUE(points_to_set.IsDistinct()); + ExpectHasTopLevelBuffers(points_to_set.CreateFlattenedSet(), {constant}); +} + TEST_F(TuplePointsToAnalysisTest, DuplicatedElement) { // Create a tuple which contains duplicate elements. 
auto builder = HloComputation::Builder(TestName()); diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc index b7c28bfac78..41011176ffa 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/tuple_util.h" #include "tensorflow/compiler/xla/service/while_loop_analysis.h" #include "tensorflow/compiler/xla/service/while_util.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" namespace xla { @@ -207,6 +208,37 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody( continue; } + if (!hoist_size_inflating_ops_) { + // Check that hoisting the instruction doesn't cause a significant memory + // blow-up. LICM extends the live-range of the output of the hoisted + // instruction to be the entire while loop, which may be problematic on + // platforms where memory is limited. This can be especially harmful if + // the instruction has a significantly larger output than its input, e.g. + // kIota, kBroadcast or kConstant. + int64 input_size = 0, output_size = 0; + + for (auto* operand : instruction->operands()) { + ShapeUtil::ForEachSubshape( + operand->shape(), + [&input_size](const Shape& subshape, const ShapeIndex& /*index*/) { + if (ShapeUtil::IsArray(subshape)) { + input_size += ShapeUtil::ByteSizeOfElements(subshape); + } + }); + } + ShapeUtil::ForEachSubshape( + instruction->shape(), + [&output_size](const Shape& subshape, const ShapeIndex& /*index*/) { + if (ShapeUtil::IsArray(subshape)) { + output_size += ShapeUtil::ByteSizeOfElements(subshape); + } + }); + + if (output_size > input_size) { + continue; + } + } + auto is_invariant = [&](HloInstruction* op) { return hoisted_instructions.find(op) != hoisted_instructions.end() || unhoisted_invariant_instructions.count(op) || diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h index 3031899f71e..bd6232dc0a9 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h @@ -34,8 +34,14 @@ class WhileLoopInvariantCodeMotion : public HloModulePass { // Setting `hoist_constants` to false can be help if LICM is run in the mid // level HLO pipeline because hoisting constants out of while loop bodies can // break optimizations like constant folding. - explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false) - : hoist_constants_(hoist_constants) {} + // Setting `hoist_size_inflating_ops` to false will forbid hoisting + // instructions where the size of the output(s) is larger than the size of the + // input(s). This is useful on platforms on which it's important to prevent + // blow-ups in memory size. 
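As a usage sketch of the constructor documented in the comment above (and exercised by the `NoHoistInflating` test later in this patch): LICM configured to hoist constants but not size-inflating instructions. The helper name is invented for this sketch; the pass API comes from this header.

```c++
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
#include "tensorflow/compiler/xla/statusor.h"

// Sketch: constants may be hoisted, but instructions whose outputs are larger
// than their inputs (kIota, kBroadcast, large kConstant, ...) stay inside the
// while body.
xla::StatusOr<bool> RunLicmWithoutInflation(xla::HloModule* module) {
  return xla::WhileLoopInvariantCodeMotion(
             /*hoist_constants=*/true,
             /*hoist_size_inflating_ops=*/false)
      .Run(module);
}
```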
+ explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false, + bool hoist_size_inflating_ops = true) + : hoist_constants_(hoist_constants), + hoist_size_inflating_ops_(hoist_size_inflating_ops) {} ~WhileLoopInvariantCodeMotion() override = default; absl::string_view name() const override { @@ -49,6 +55,7 @@ class WhileLoopInvariantCodeMotion : public HloModulePass { HloInstruction* while_instr); bool hoist_constants_; + bool hoist_size_inflating_ops_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc index 046ccb2d3f2..8e7c4bc8828 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc @@ -570,5 +570,59 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DoNotHoistOutOfSingleIteration) { EXPECT_FALSE(simplified_loop); } +const char* const kInflatingTestCase = R"( +HloModule ModuleWithWhile + +mul { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT mul = f32[] multiply(lhs, rhs) +} + +body { + p_body = (f32[]) parameter(0) + iota = f32[1024, 1024] iota(), iota_dimension=0 + add = f32[1024, 1024] add(iota, iota) + constant = f32[] constant(1.0) + reduce = f32[] reduce(f32[1024, 1024] add, f32[] constant), dimensions={0,1}, to_apply=mul + ROOT root = (f32[]) tuple(reduce) +} + +condition { + p_cond = (f32[]) parameter(0) + ROOT result = pred[] constant(true) +} + +ENTRY entry { + param = f32[] parameter(0) + while_init = (f32[]) tuple(param) + ROOT while = (f32[]) while(while_init), condition=condition, body=body +} +)"; + +TEST_F(WhileLoopInvariantCodeMotionTest, HoistsInflatingByDefault) { + auto m = ParseAndReturnVerifiedModule(kInflatingTestCase).ValueOrDie(); + + TF_ASSERT_OK_AND_ASSIGN( + bool simplified_loop, + WhileLoopInvariantCodeMotion(/*hoist_constants=*/true).Run(m.get())); + EXPECT_TRUE(simplified_loop); + + HloComputation* while_body = m->GetComputationWithName("wide.body"); + ASSERT_NE(while_body, nullptr); + EXPECT_THAT(while_body->instructions(), Not(Contains(op::Iota()))); +} + +TEST_F(WhileLoopInvariantCodeMotionTest, NoHoistInflating) { + auto m = ParseAndReturnVerifiedModule(kInflatingTestCase).ValueOrDie(); + + TF_ASSERT_OK_AND_ASSIGN( + bool simplified_loop, + WhileLoopInvariantCodeMotion(/*hoist_constants=*/true, + /*hoist_size_inflating_ops=*/false) + .Run(m.get())); + EXPECT_FALSE(simplified_loop); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc index 6f924a29d8a..d30f67dd811 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc @@ -19,13 +19,17 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "absl/types/optional.h" +#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_query.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" #include "tensorflow/compiler/xla/service/while_loop_analysis.h" namespace xla { +namespace m = match; using absl::optional; using hlo_query::ContainsInstrWithOpcode; @@ -302,6 +306,147 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { return true; } +// Removes each loop parameter (i.e. member of the while loop tuple) that is a +// constant and is the same in the while loop body and the while loop init. +static StatusOr TryRemoveConstantParams(HloInstruction* while_op) { + HloModule* module = while_op->GetModule(); + HloComputation* computation = while_op->parent(); + auto* while_init = while_op->mutable_operand(0); + auto* while_body = while_op->while_body(); + auto* while_cond = while_op->while_condition(); + auto* while_body_root = while_body->root_instruction(); + if (while_init->opcode() != HloOpcode::kTuple || + while_body_root->opcode() != HloOpcode::kTuple) { + return false; + } + + TF_RET_CHECK(while_cond->num_parameters() == 1); + TF_RET_CHECK(while_body->num_parameters() == 1); + TF_RET_CHECK( + ShapeUtil::Compatible(while_init->shape(), while_body_root->shape())); + + absl::flat_hash_set constant_tuple_indices; + const auto& while_shape = while_init->shape(); + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + auto* init_elem = while_init->operand(i); + auto* body_elem = while_body_root->operand(i); + if (init_elem->opcode() == HloOpcode::kConstant && + body_elem->opcode() == HloOpcode::kConstant && + init_elem->literal() == body_elem->literal()) { + constant_tuple_indices.insert(i); + } + } + + if (constant_tuple_indices.empty()) { + return false; + } + + // OK, we found some constant elements of the while parameter! Eliminate + // them. + std::vector new_while_shape_elems; + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + if (!constant_tuple_indices.count(i)) { + new_while_shape_elems.push_back(while_shape.tuple_shapes(i)); + } + } + Shape new_while_shape = ShapeUtil::MakeTupleShape(new_while_shape_elems); + + // `new_instrs` holds instructions created outside of a computation for + // cloning. Elements added here just need to live until the end of the + // relevant CloneWithReplacement call. + std::vector> new_instrs; + auto add_new_instr = [&](std::unique_ptr instr) { + new_instrs.push_back(std::move(instr)); + return new_instrs.back().get(); + }; + + // Returns a new tuple without the elements of constant_tuple_indices. 
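To make the index bookkeeping of the `remove_constant_elems`/`add_constant_elems` helpers below concrete, here is a standalone illustration with plain integers (not HLO; the function name is invented): removing the constant tuple indices shifts every surviving element left, which is what the running counter `j` in `add_constant_elems` accounts for.

```c++
#include <set>
#include <vector>

// Illustration only: map old while-tuple indices to their position in the
// shrunken tuple after the constant indices are removed.
// E.g. tuple_size = 3, constant_indices = {1}  ->  {0, -1, 1}.
std::vector<int> OldToNewTupleIndex(int tuple_size,
                                    const std::set<int>& constant_indices) {
  std::vector<int> old_to_new(tuple_size, -1);  // -1 means "element removed"
  int next = 0;
  for (int i = 0; i < tuple_size; ++i) {
    if (constant_indices.count(i) == 0) {
      old_to_new[i] = next++;
    }
  }
  return old_to_new;
}
```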
+ auto remove_constant_elems = [&](HloInstruction* instr) { + CHECK(ShapeUtil::Compatible(instr->shape(), while_shape)); + + std::vector tuple_elems; + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + if (!constant_tuple_indices.count(i)) { + tuple_elems.push_back( + add_new_instr(HloInstruction::CreateGetTupleElement( + while_shape.tuple_shapes(i), instr, i))); + } + } + return HloInstruction::CreateTuple(tuple_elems); + }; + + auto add_constant_elems = [&](HloInstruction* instr) { + CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape)); + + std::vector tuple_elems; + int64 j = 0; + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + if (constant_tuple_indices.count(i)) { + tuple_elems.push_back(while_init->mutable_operand(i)); + } else { + tuple_elems.push_back( + add_new_instr(HloInstruction::CreateGetTupleElement( + while_shape.tuple_shapes(i), instr, j))); + ++j; + } + } + return HloInstruction::CreateTuple(tuple_elems); + }; + + // Special case: constant_tuple_indices covers the whole while parameter, so + // the new while shape is the empty tuple. In this case, the value of the + // while loop is simply equal to the value of `init`. + // + // It's unfortunate to special-case this, but it's simpler than the + // alternative. The problem is that if our while parameter has no + // non-constant elems, the tuple returned by `add_constant_elems` won't depend + // on instr (the loop body/cond parameter), and therefore + // CloneWithReplacementPairs will *leave the parameter out entirely*, creating + // invalid HLO. + if (ShapeUtil::IsEmptyTuple(new_while_shape)) { + TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, while_init)); + return true; + } + + std::unique_ptr new_while_cond = + while_cond->CloneWithReplacementPairs({ + while_cond->parameter_instruction(0), + add_constant_elems(add_new_instr(HloInstruction::CreateParameter( + 0, new_while_shape, + while_cond->parameter_instruction(0)->name()))), + }); + + std::unique_ptr new_while_body = + while_body->CloneWithReplacementPairs( + { + while_body->parameter_instruction(0), + add_constant_elems(add_new_instr(HloInstruction::CreateParameter( + 0, new_while_shape, + while_cond->parameter_instruction(0)->name()))), + }, + { + while_body->root_instruction(), + remove_constant_elems( + add_new_instr(while_body->root_instruction()->Clone())), + }); + + // Create the final while loop, and add any new instructions created to + // `computation`. + new_instrs.clear(); + TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction( + while_op, + add_constant_elems( + computation->AddInstruction(HloInstruction::CreateWhile( + new_while_shape, + module->AddEmbeddedComputation(std::move(new_while_cond)), + module->AddEmbeddedComputation(std::move(new_while_body)), + add_new_instr(remove_constant_elems(while_init))))))); + for (auto& instr : new_instrs) { + computation->AddInstruction(std::move(instr)); + } + return true; +} + // Tries to remove a while loop from the graph. // // - Loops with trip count of 0 can be replaced by the loop's "init" value. @@ -381,16 +526,14 @@ static StatusOr TryPropagateConstant(HloInstruction* while_op) { // performance by forcing us to copy constants. 
absl::flat_hash_map index_to_constant; for (int i = 0; i < root_operands.size(); i++) { - HloInstruction* instr = root_operands[i]; - if (instr->opcode() == HloOpcode::kGetTupleElement && - instr->tuple_index() == i && instr->operand(0) == while_body_param && - ShapeUtil::IsScalar(instr->shape())) { - auto tuple_element = while_init->operand(i); - if (tuple_element->IsConstant()) { - VLOG(3) << "Found loop invariant tuple element " << i << " " - << tuple_element->ToString(); - index_to_constant[i] = tuple_element; - } + const HloInstruction* init_tuple_elem = nullptr; + if (Match(root_operands[i], + m::GetTupleElement(m::Op().Is(while_body_param), i) + .WithShape(m::Shape().IsScalar())) && + Match(while_init->operand(i), m::Constant(&init_tuple_elem))) { + VLOG(3) << "Found loop invariant tuple element " << i << " " + << init_tuple_elem->ToString(); + index_to_constant[i] = init_tuple_elem; } } @@ -519,14 +662,6 @@ static StatusOr TryFlattenNestedTuples(HloInstruction* while_op) { return false; } - // Cowardly refuse to perform this optimization in the presence of kDomain - // instructions, which may reference other instructions in the loop and - // therefore make this complicated. - if (ContainsInstrWithOpcode(while_body, {HloOpcode::kDomain}) || - ContainsInstrWithOpcode(while_cond, {HloOpcode::kDomain})) { - return false; - } - std::vector flattened_shape_elems; ShapeUtil::ForEachSubshape(while_shape, [&](const Shape& s, const ShapeIndex& /*index*/) { @@ -605,6 +740,243 @@ static StatusOr TryFlattenNestedTuples(HloInstruction* while_op) { return true; } +// Tries to merge loop induction variables of a given type. +// +// In this pass we're only concerned with elements of the loop's tuple that +// are effective-scalars of type `elem_ty`. Some terminology: +// +// - The trip counter is the first element of the loop's tuple that starts at +// 0 and does x++ on each iteration. +// +// - An induction variable is an element of the loop's tuple that is not the +// trip counter and does `x += ` on each iteration of the loop. +// Negative constants are OK. +// +// This pass adds a trip counter if one isn't already present, then replaces +// each induction variable with +// +// + * . +// +// This reduces the number of scalar operations in the loop, which is important +// e.g. on GPUs, where each scalar operation is nontrivially expensive because +// it's a separate kernel launch. +// +// Returns the new loop if a change was made, or null if no change was made. +// Note that the new loop is not a valid replacement for the old loop; it may +// need to be wrapped in a tuple that changes its shape. We return the loop +// itself so that you can call TryMergeInductionVariables in a loop, once for +// each integral type elem_ty. 
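The comment above reduces to a single arithmetic identity: a variable that starts at `init` and does `x += step` on each iteration holds `init + trip_count * step` on every iteration, so only one counter has to live in the loop carry. A tiny standalone check of that identity with plain integers (not HLO):

```c++
#include <cassert>

// Worked example of the identity behind TryMergeInductionVariables: every
// induction variable equals init + trip_counter * step on each iteration.
int main() {
  const int b_init = 100, b_step = -3;  // induction variable; step may be negative
  int trip_counter = 0;                 // starts at 0, incremented by 1 per iteration
  int b = b_init;
  for (int iter = 0; iter < 10; ++iter) {
    assert(b == b_init + trip_counter * b_step);
    ++trip_counter;
    b += b_step;
  }
  return 0;
}
```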
+static StatusOr TryMergeInductionVariables( + HloInstruction* while_op, PrimitiveType elem_ty) { + CHECK(primitive_util::IsIntegralType(elem_ty)) << PrimitiveType_Name(elem_ty); + HloModule* module = while_op->GetModule(); + HloComputation* computation = while_op->parent(); + auto* while_init = while_op->mutable_operand(0); + auto* while_body = while_op->while_body(); + auto* while_cond = while_op->while_condition(); + auto* while_body_root = while_body->root_instruction(); + if (while_init->opcode() != HloOpcode::kTuple || + while_body_root->opcode() != HloOpcode::kTuple) { + return nullptr; + } + + TF_RET_CHECK(while_cond->num_parameters() == 1); + TF_RET_CHECK(while_body->num_parameters() == 1); + TF_RET_CHECK( + ShapeUtil::Compatible(while_init->shape(), while_body_root->shape())); + Shape while_shape = while_init->shape(); + + // The tuple index of the trip counter, if one is present. + absl::optional trip_counter; + // Maps the tuple index of each induction variable to its constant increment. + absl::flat_hash_map induction_vars; + for (int64 i = 0; i < while_body_root->operand_count(); ++i) { + HloInstruction* constant; + if (!Match(while_body_root->mutable_operand(i), + m::AddAnyOrder(m::GetTupleElement(m::Parameter(), i), + m::ConstantScalar(&constant)) + .WithShape(m::Shape().WithElementType(elem_ty)))) { + continue; + } + if (!trip_counter && constant->literal().IsAll(1) && + while_init->operand(i)->IsConstant() && + while_init->operand(i)->literal().IsAll(0)) { + VLOG(10) << "Found existing trip counter at index " << i; + trip_counter = i; + } else { + VLOG(10) << "Found induction variable at index " << i; + induction_vars.emplace(i, Cast(constant)); + } + } + + // There's only something to simplify if we can either: + // + // - combine one or more induction vars with an existing trip counter, or + // - replace two or more induction variables with a new trip counter. + // + // Put another way, there's only something to simplify if the number of + // induction vars plus the number of existing trip counters (0 or 1) is >= 2. + if (induction_vars.size() + (trip_counter.has_value() ? 1 : 0) < 2) { + return nullptr; + } + + // OK, we're going to do the transformation! Set up some helpers. + + // `new_instrs` holds instructions created outside of a computation for + // cloning. Elements added here just need to live until the end of the + // relevant CloneWithReplacement call. + std::vector> new_instrs; + auto add_new_instr = [&](std::unique_ptr instr) { + new_instrs.push_back(std::move(instr)); + return new_instrs.back().get(); + }; + + auto add_binary_op = [&](const Shape& shape, HloOpcode opcode, + HloInstruction* lhs, HloInstruction* rhs) { + // Reshape lhs/rhs to the output shape if necessary. This deals with the + // fact that induction variables need only be effective scalars, not true + // scalars. + if (!ShapeUtil::Compatible(shape, lhs->shape())) { + lhs = add_new_instr(HloInstruction::CreateReshape(shape, lhs)); + } + if (!ShapeUtil::Compatible(shape, rhs->shape())) { + rhs = add_new_instr(HloInstruction::CreateReshape(shape, rhs)); + } + return add_new_instr(HloInstruction::CreateBinary(shape, opcode, lhs, rhs)); + }; + + auto add_gte = [&](HloInstruction* src, int64 idx) { + return add_new_instr(HloInstruction::CreateGetTupleElement( + src->shape().tuple_shapes(idx), src, idx)); + }; + + // Our new while loop will have the same shape as the old while loop, except + // we'll add a trip counter to the end if it wasn't originally present. 
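For readers unfamiliar with the `xla::match` combinators used in the detection loop above (`m::AddAnyOrder`, `m::GetTupleElement`, `m::ConstantScalar`, `.WithShape(...)`), here is a simplified, hypothetical predicate written in the same style, using only combinators that appear in this patch:

```c++
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/pattern_matcher.h"

namespace m = xla::match;

// Hypothetical helper, for illustration only: does `instr` compute
// "scalar something + scalar constant" (in either operand order)?
// On success, *constant_out points at the matched constant.
bool IsScalarAddOfConstant(xla::HloInstruction* instr,
                           xla::HloInstruction** constant_out) {
  return xla::Match(instr,
                    m::AddAnyOrder(m::Op(), m::ConstantScalar(constant_out))
                        .WithShape(m::Shape().IsScalar()));
}
```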
+ Shape new_while_shape = while_shape; + bool added_trip_counter = false; + if (!trip_counter) { + VLOG(10) << "Adding new trip counter to end of loop's tuple."; + trip_counter = new_while_shape.tuple_shapes_size(); + *new_while_shape.add_tuple_shapes() = + ShapeUtil::MakeShape(elem_ty, /*dimensions=*/{}); + added_trip_counter = true; + } + + // Converts `instr` into a tuple of the "old" form -- that is, to a tuple with + // shape `while_body->shape()` and where the induction variables are "reified" + // (i.e. they have value + * ). + auto convert_to_old_form = [&](HloInstruction* instr) { + CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape)); + std::vector tuple_elems; + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + const auto& elem_shape = while_shape.tuple_shapes(i); + if (!induction_vars.count(i)) { + tuple_elems.push_back(add_gte(instr, i)); + continue; + } + tuple_elems.push_back(add_binary_op( + elem_shape, HloOpcode::kAdd, add_gte(instr, i), + add_binary_op(elem_shape, HloOpcode::kMultiply, + add_gte(instr, *trip_counter), + add_new_instr(induction_vars.at(i)->Clone())))); + } + return HloInstruction::CreateTuple(tuple_elems); + }; + + // Converts `root` into a tuple of the "new" form -- that is, to a tuple with + // shape `new_while_shape` and where the induction variables (but not trip + // counters) are replaced with their unchanging values. + auto convert_to_new_form = [&](HloInstruction* old_root, + HloParameterInstruction* loop_body_param) { + CHECK(ShapeUtil::Compatible(old_root->shape(), while_shape)); + std::vector tuple_elems; + + // In the new form, induction variables come from `init`, everything else + // (including the trip counter if it's not one we created ourselves) comes + // from the `root` tuple unmodified. + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + tuple_elems.push_back( + add_gte((induction_vars.count(i) ? loop_body_param : old_root), i)); + } + // If we created a trip counter ourselves, add 1 to it in the next + // iteration. + if (added_trip_counter) { + tuple_elems.push_back(add_binary_op( + new_while_shape.tuple_shapes(*trip_counter), HloOpcode::kAdd, + add_gte(loop_body_param, *trip_counter), + add_new_instr( + HloInstruction::CreateConstant(LiteralUtil::One(elem_ty))))); + } + + return HloInstruction::CreateTuple(tuple_elems); + }; + + // Creates a new init tuple, which is the same as the old init tuple except if + // we added a trip counter, it's set to 0. + auto get_new_while_init = [&](HloInstruction* init) { + CHECK(ShapeUtil::Compatible(init->shape(), while_shape)); + if (!added_trip_counter) { + return init; + } + std::vector tuple_elems; + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + tuple_elems.push_back(add_gte(init, i)); + } + tuple_elems.push_back(add_new_instr( + HloInstruction::CreateConstant(LiteralUtil::Zero(elem_ty)))); + return add_new_instr(HloInstruction::CreateTuple(tuple_elems)); + }; + + std::unique_ptr new_while_cond = + while_cond->CloneWithReplacementPairs({ + while_cond->parameter_instruction(0), + convert_to_old_form(add_new_instr(HloInstruction::CreateParameter( + 0, new_while_shape, + while_cond->parameter_instruction(0)->name()))), + }); + + // Creating the new while body proceeds in two steps. First we convert the + // users of the parameter to the old form. Then as a second + // CloneWithReplacement operation we convert the root to the new form. 
We + // have to do this in two steps because the new root needs to use the new + // param0, and during the first clone operation, only the *old-form* param0 is + // accessible. + // + // We have to add temp_new_while_body to the module because cloning a + // computation touches the module (to get its NameUniquer). + HloComputation* temp_new_while_body = + module->AddEmbeddedComputation(while_body->CloneWithReplacementPairs({ + while_body->parameter_instruction(0), + convert_to_old_form(add_new_instr(HloInstruction::CreateParameter( + 0, new_while_shape, + while_body->parameter_instruction(0)->name()))), + })); + std::unique_ptr new_while_body = + temp_new_while_body->CloneWithReplacementPairs({ + temp_new_while_body->root_instruction(), + convert_to_new_form( + add_new_instr(temp_new_while_body->root_instruction()->Clone()), + Cast( + temp_new_while_body->parameter_instruction(0))), + }); + TF_RETURN_IF_ERROR(module->RemoveEmbeddedComputation(temp_new_while_body)); + + // Create the final while loop, and add any new instructions created to + // `computation`. + new_instrs.clear(); + auto* new_while = computation->AddInstruction(HloInstruction::CreateWhile( + new_while_shape, + module->AddEmbeddedComputation(std::move(new_while_cond)), + module->AddEmbeddedComputation(std::move(new_while_body)), + get_new_while_init(while_init))); + TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction( + while_op, convert_to_old_form(new_while))); + for (auto& instr : new_instrs) { + computation->AddInstruction(std::move(instr)); + } + return new_while; +} + StatusOr WhileLoopSimplifier::Run(HloModule* module) { XLA_VLOG_LINES(3, "WhileLoopSimplifier::Run(), before:\n" + module->ToString()); @@ -650,19 +1022,50 @@ StatusOr WhileLoopSimplifier::Run(HloModule* module) { continue; } + // TODO(b/119281462): Cowardly refuse to perform any of the following + // optimizations in the presence of kDomain instructions. It seems that + // modifying a while loop's tuple doesn't work when kDomain is present. + if (ContainsInstrWithOpcode(while_op->while_body(), {HloOpcode::kDomain}) || + ContainsInstrWithOpcode(while_op->while_condition(), + {HloOpcode::kDomain})) { + continue; + } + + // Each of the optimizations below modifies the while loop itself if it's + // successful, meaning that `while_op` is no longer valid after one of these + // transformations returns true. + TF_ASSIGN_OR_RETURN(result, TryFlattenNestedTuples(while_op)); changed |= result; if (result) { - // Successfully flattening nested tuples results in us cloning and - // replacing the while loop, meaning that `while_op` is no longer valid. continue; } TF_ASSIGN_OR_RETURN(result, TryRemoveDeadWhileParams(while_op)); changed |= result; if (result) { - // Successfully removing dead while params results in us cloning and - // replacing the while loop, meaning that `while_op` is no longer valid. + continue; + } + + TF_ASSIGN_OR_RETURN(result, TryRemoveConstantParams(while_op)); + changed |= result; + if (result) { + continue; + } + + bool merged_induction_vars = false; + // Notably missing from this list are S16 and U16. These don't currently + // work because S/U16 literals are not implemented. 
+ for (auto elem_ty : {S8, U8, S32, U32, S64, U64}) { + TF_ASSIGN_OR_RETURN(auto* new_while_op, + TryMergeInductionVariables(while_op, elem_ty)); + if (new_while_op) { + while_op = new_while_op; + changed = true; + merged_induction_vars = true; + } + } + if (merged_induction_vars) { continue; } } diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc index 05005e0b262..4950e8269e9 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc @@ -17,9 +17,12 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_replace.h" +#include "tensorflow/compiler/xla/service/algebraic_simplifier.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -27,8 +30,17 @@ limitations under the License. namespace xla { namespace { +using ::testing::_; namespace op = xla::testing::opcode_matchers; +// Returns the first kWhile instruction within m's entry computation. +HloInstruction* FindFirstWhile(HloModule* m) { + const auto& instrs = m->entry_computation()->instructions(); + return *absl::c_find_if(instrs, [](const HloInstruction* instr) { + return instr->opcode() == HloOpcode::kWhile; + }); +} + class WhileLoopSimplifierTest : public HloTestBase { protected: // Makes an HloModule that contains a loop with `num_iters` iteration. @@ -540,11 +552,7 @@ TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) { // it easy to find. EXPECT_TRUE(HloDCE().Run(m.get()).ok()); - const auto& instrs = m->entry_computation()->instructions(); - HloInstruction* new_while = - *absl::c_find_if(instrs, [](const HloInstruction* instr) { - return instr->opcode() == HloOpcode::kWhile; - }); + HloInstruction* new_while = FindFirstWhile(m.get()); Shape flat_tuple = ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3], s32[4])") .ValueOrDie(); @@ -563,5 +571,177 @@ TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) { .ValueOrDie())); } +// Edge-case: All elements of the loop carry are constants which can be removed, +// leaving us with a nullary loop. This is a special case, we just replace the +// loop with its init. 
+TEST_F(WhileLoopSimplifierTest, OnlyConstantsInLoopCarry) { + const string hlo_string = R"( + HloModule Test + Body { + param = (s32[1]) parameter(0) + a = s32[1] constant({0}) + ROOT tuple = (s32[1]) tuple(a) + } + Cond { + param = (s32[1]) parameter(0) + ROOT cond = pred[] constant(true) + } + ENTRY Loop { + a = s32[1] constant({0}) + init = (s32[1]) tuple(a) + ROOT while = (s32[1]) while(init), condition=Cond, body=Body + })"; + + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + EXPECT_TRUE(HloDCE().Run(m.get()).ok()); + EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + op::Tuple(op::Constant())); +} + +TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarry) { + const string hlo_string = R"( + HloModule Test + Body { + param = (s32[1], s32[2], s32[3]) parameter(0) + a = s32[1] get-tuple-element(param), index=0 + a.1 = s32[1] add(a, a) + b = s32[2] constant({1,1}) + c = s32[3] constant({10,10,10}) + ROOT tuple = (s32[1], s32[2], s32[3]) tuple(a.1, b, c) + } + Cond { + param = (s32[1], s32[2], s32[3]) parameter(0) + /* Use each tuple element. The verifier will then ensure that if any of + * these get modified, they're replaced with values of the correct shape. */ + a = s32[1] get-tuple-element(param), index=0 + b = s32[2] get-tuple-element(param), index=1 + c = s32[3] get-tuple-element(param), index=2 + ROOT cond = pred[] constant(true) + } + ENTRY Loop { + /* Only `b` should be simplified away. `a` is not a constant within the + * loop, and `c`'s value changes depending on whether we run 0 or 1 + * iterations of the loop. */ + a = s32[1] constant({0}) + b = s32[2] constant({1,1}) + c = s32[3] constant({2,2,2}) + init = (s32[1], s32[2], s32[3]) tuple(a,b,c) + ROOT while = (s32[1], s32[2], s32[3]) while(init), + condition=Cond, body=Body + })"; + + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + // DCE away the old loop so there's just one while loop in the module, making + // it easy to find. + EXPECT_TRUE(HloDCE().Run(m.get()).ok()); + // Run the tuple simplifier to make the resulting HLO a bit easier to check. 
+ EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok()); + + HloInstruction* new_while = FindFirstWhile(m.get()); + Shape new_while_shape = + ShapeUtil::ParseShapeString("(s32[1], s32[3])").ValueOrDie(); + EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->root_instruction()->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->parameter_instruction(0)->shape(), + new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_condition()->parameter_instruction(0)->shape(), + new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + m->entry_computation()->root_instruction()->shape(), + ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3])").ValueOrDie())); + EXPECT_THAT(m->entry_computation()->root_instruction(), + op::Tuple(_, op::Constant(), _)); +} + +const char* const kSimpleMergeInductionVariablesModule = R"( + HloModule Test + Body { + param = (TYPE[], TYPE[], TYPE[]) parameter(0) + + a = TYPE[] get-tuple-element(param), index=0 + one = TYPE[] constant(1) + a1 = TYPE[] add(a, one) + + b = TYPE[] get-tuple-element(param), index=1 + negone = TYPE[] constant(-1) + b1 = TYPE[] add(b, negone) + + c = TYPE[] add(a, b) + + ROOT tuple = (TYPE[], TYPE[], TYPE[]) tuple(a1,b1,c) + } + Cond { + param = (TYPE[], TYPE[], TYPE[]) parameter(0) + a = TYPE[] get-tuple-element(param), index=0 + b = TYPE[] get-tuple-element(param), index=1 + sum = TYPE[] power(a, b) + ten = TYPE[] constant(10) + ROOT cond = pred[] less-than(sum, ten) + } + ENTRY Loop { + a = TYPE[] constant(10) + b = TYPE[] constant(100) + c = TYPE[] constant(0) + init = (TYPE[], TYPE[], TYPE[]) tuple(a,b,c) + while = (TYPE[], TYPE[], TYPE[]) while(init), condition=Cond, body=Body + + a1 = TYPE[] get-tuple-element(while), index=0 + b1 = TYPE[] get-tuple-element(while), index=1 + ROOT sum = TYPE[] add(a1, b1) + })"; + +TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_Simple) { + string hlo_string = absl::StrReplaceAll(kSimpleMergeInductionVariablesModule, + {{"TYPE", "s32"}}); + + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + // DCE away the old loop so there's just one while loop in the module, making + // it easy to find, and run the tuple simplifier to make the resulting HLO + // easier to check. + EXPECT_TRUE(HloDCE().Run(m.get()).ok()); + EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok()); + + HloInstruction* new_while = FindFirstWhile(m.get()); + // We should have added a new loop counter for s32[] to the end of the tuple. 
+ SCOPED_TRACE(m->ToString()); + Shape new_while_shape = + ShapeUtil::ParseShapeString("(s32[], s32[], s32[], s32[])").ValueOrDie(); + EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->root_instruction()->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->parameter_instruction(0)->shape(), + new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_condition()->parameter_instruction(0)->shape(), + new_while_shape)); + + EXPECT_THAT(new_while->while_body()->root_instruction(), + op::Tuple(op::GetTupleElement(op::Parameter(), 0), + op::GetTupleElement(op::Parameter(), 1), op::Add(), + op::Add(op::GetTupleElement(op::Parameter(), 3), + op::Constant()))); + EXPECT_THAT(new_while->while_condition()->root_instruction(), + op::Lt(op::Power(op::Add(), op::Add()), op::Constant())); +} + +// We shouldn't merge S16 induction variables; we can't create constants of this +// type because S16 literals are not implemented. +TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_SkipS16) { + string hlo_string = absl::StrReplaceAll(kSimpleMergeInductionVariablesModule, + {{"TYPE", "s16"}}); + EXPECT_FALSE( + WhileLoopSimplifier() + .Run(ParseAndReturnVerifiedModule(hlo_string).ValueOrDie().get()) + .ValueOrDie()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc new file mode 100644 index 00000000000..746ab9e9977 --- /dev/null +++ b/tensorflow/compiler/xla/shape.cc @@ -0,0 +1,107 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/shape.h" + +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/shape_util.h" + +namespace xla { + +Shape::Shape(const ShapeProto& shape_proto) { + set_element_type(shape_proto.element_type()); + dimensions_.reserve(shape_proto.dimensions_size()); + for (const int64 dimension : shape_proto.dimensions()) { + add_dimensions(dimension); + } + tuple_shapes_.reserve(shape_proto.tuple_shapes_size()); + for (const ShapeProto& element_shape : shape_proto.tuple_shapes()) { + *add_tuple_shapes() = Shape(element_shape); + } + if (shape_proto.has_layout()) { + *mutable_layout() = shape_proto.layout(); + } +} + +ShapeProto Shape::ToProto() const { + ShapeProto proto; + proto.set_element_type(element_type_); + proto.mutable_dimensions()->Reserve(dimensions_size()); + for (const int64 dimension : dimensions()) { + proto.add_dimensions(dimension); + } + proto.mutable_tuple_shapes()->Reserve(tuple_shapes_size()); + for (const Shape& shape : tuple_shapes()) { + *proto.add_tuple_shapes() = shape.ToProto(); + } + if (has_layout()) { + *proto.mutable_layout() = layout(); + } + return proto; +} + +string Shape::ToString(bool print_layout) const { + if (print_layout) { + return ShapeUtil::HumanStringWithLayout(*this); + } else { + return ShapeUtil::HumanString(*this); + } +} + +std::ostream& operator<<(std::ostream& out, const Shape& shape) { + out << shape.ToString(/*print_layout=*/true); + return out; +} + +ProgramShape::ProgramShape(const ProgramShapeProto& program_shape_proto) { + for (const ShapeProto& shape_proto : program_shape_proto.parameters()) { + *add_parameters() = Shape(shape_proto); + } + *mutable_result() = Shape(program_shape_proto.result()); + for (const string& name : program_shape_proto.parameter_names()) { + add_parameter_names(name); + } +} + +ProgramShapeProto ProgramShape::ToProto() const { + ProgramShapeProto proto; + for (const Shape& shape : parameters()) { + *proto.add_parameters() = shape.ToProto(); + } + *proto.mutable_result() = result().ToProto(); + for (const string& name : parameter_names()) { + proto.add_parameter_names(name); + } + return proto; +} + +string ProgramShape::ToString() const { + std::vector parameter_strings(parameters_size()); + for (int i = 0; i < parameters_size(); ++i) { + parameter_strings[i] = absl::StrCat( + i < parameter_names_size() ? parameter_names(i) : "(unknown)", ": ", + ShapeUtil::HumanString(parameters(i))); + } + return absl::StrCat("(", absl::StrJoin(parameter_strings, ", "), ") -> ", + ShapeUtil::HumanString(result())); +} + +std::ostream& operator<<(std::ostream& out, const ProgramShape& program_shape) { + out << program_shape.ToString() << "\n"; + return out; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h new file mode 100644 index 00000000000..7f6b14ab428 --- /dev/null +++ b/tensorflow/compiler/xla/shape.h @@ -0,0 +1,204 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SHAPE_H_ +#define TENSORFLOW_COMPILER_XLA_SHAPE_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +// A shape describes the number of dimensions in a array, the bounds of each +// dimension, and the primitive component type. For tuples, shape describes the +// structure (number of elements and nesting). +class Shape { + public: + Shape() = default; + + // Construct a shape from a ShapeProto. + explicit Shape(const ShapeProto& shape_proto); + + // Returns a ShapeProto representation of the Shape. + ShapeProto ToProto() const; + + // Returns a human-readable string that represents the given shape, with or + // without layout. e.g. "F32[42,12] {0, 1}" or "F32[64]". + string ToString(bool print_layout = false) const; + + // The following methods mirror the protobuf generated code interface for the + // message ShapeProto. This enabled easy migration of this data structure + // from a proto to a proper C++ class. + // TODO(b/29771030): Replace or augment these methods with a more ergonomic + // interface. + + // Methods for accessing the primitive type. + PrimitiveType element_type() const { return element_type_; } + void set_element_type(PrimitiveType value) { element_type_ = value; } + + // Methods for accessing the dimensions array. + int dimensions_size() const { return dimensions_.size(); } + int64 dimensions(int index) const { return dimensions_.at(index); } + void set_dimensions(int index, int64 value) { dimensions_.at(index) = value; } + void add_dimensions(int64 value) { dimensions_.push_back(value); } + void clear_dimensions() { dimensions_.clear(); } + const std::vector& dimensions() const { return dimensions_; } + std::vector* mutable_dimensions() { return &dimensions_; } + + // Methods for accessing the tuple subshapes. This field only non-empty for + // tuple shapes. + int tuple_shapes_size() const { return tuple_shapes_.size(); } + const Shape& tuple_shapes(int index) const { return tuple_shapes_.at(index); } + Shape* mutable_tuple_shapes(int index) { return &tuple_shapes_.at(index); } + Shape* add_tuple_shapes() { + tuple_shapes_.push_back(Shape()); + return &tuple_shapes_.back(); + } + void clear_tuple_shapes() { tuple_shapes_.clear(); } + const std::vector& tuple_shapes() const { return tuple_shapes_; } + std::vector* mutable_tuple_shapes() { return &tuple_shapes_; } + + // Methods for accessing the layout field. 
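Since `Shape`'s accessors above deliberately mirror the protobuf-generated API, existing call sites keep working after the migration. A small hedged sketch of building a tuple shape directly through these accessors (in real code one would normally use `ShapeUtil::MakeShape`/`MakeTupleShape`, which also attach a default layout):

```c++
#include "tensorflow/compiler/xla/shape.h"

// Sketch: construct f32[2,3] and wrap it in a tuple using the proto-style
// accessors of the new Shape class. No layout is set here; ShapeUtil's
// factory functions would normally add a default one.
xla::Shape MakeTupleOfMatrix() {
  xla::Shape matrix;
  matrix.set_element_type(xla::F32);
  matrix.add_dimensions(2);
  matrix.add_dimensions(3);

  xla::Shape tuple;
  tuple.set_element_type(xla::TUPLE);
  *tuple.add_tuple_shapes() = matrix;  // tuple.ToString() == "(f32[2,3])"
  return tuple;
}
```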
+ bool has_layout() const { return layout_.has_value(); } + const Layout& layout() const { + if (layout_.has_value()) { + return *layout_; + } else { + return Layout::default_instance(); + } + } + Layout* mutable_layout() { + if (!layout_.has_value()) { + layout_ = Layout(); + } + return &layout_.value(); + } + void clear_layout() { layout_.reset(); } + + void Swap(Shape* other) { + using std::swap; + swap(*this, *other); + } + + void Clear() { + element_type_ = PRIMITIVE_TYPE_INVALID; + dimensions_.clear(); + tuple_shapes_.clear(); + layout_.reset(); + } + + string SerializeAsString() const { return ToProto().SerializeAsString(); } + string ShortDebugString() const { return ToProto().ShortDebugString(); } + string DebugString() const { return ToProto().DebugString(); } + + public: + // The element type of this shape (tuple, array, etc). + PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID; + + // The array bounds of the dimensions. This is nonempty only for array shapes. + std::vector dimensions_; + + // The tuple element subshapes. This is nonempty only for tuple shapes. + std::vector tuple_shapes_; + + // The array layout of the shape. This is present only for array shapes. + absl::optional layout_; +}; + +// Shape of the parameters and output of an XLA computation. This is analogous +// to a traditional function signature. +class ProgramShape { + public: + ProgramShape() = default; + + // Creates a ProgramShape from a ProgramShapeProto protobuf. + explicit ProgramShape(const ProgramShapeProto& program_shape_proto); + + // Returns a proto representation of the object. + ProgramShapeProto ToProto() const; + + string ToString() const; + + // The following methods mirror the protobuf generated code interface for the + // message ProgramShapeProto. This enabled easy migration of this data + // structure from a proto to a proper C++ class. + // TODO(b/29771030): Replace or augment these methods with a more ergonomic + // interface. + + // Methods for accessing and manipulating the Shape of the parameters. + int parameters_size() const { return parameters_.size(); } + const Shape& parameters(int index) const { return parameters_.at(index); } + Shape* mutable_parameters(int index) { return ¶meters_.at(index); } + Shape* add_parameters() { + parameters_.emplace_back(); + return ¶meters_.back(); + } + void clear_parameters() { parameters_.clear(); } + const std::vector& parameters() const { return parameters_; } + std::vector* mutable_parameters() { return ¶meters_; } + + // Methods for accessing and manipulating the Shape of the result. + const Shape& result() const { return result_; } + Shape* mutable_result() { return &result_; } + + // Methods for accessing and manipulating the names of the parameters. 
+ int parameter_names_size() const { return parameter_names_.size(); } + const string& parameter_names(int index) const { + return parameter_names_.at(index); + } + void set_parameter_names(int index, const string& value) { + parameter_names_.at(index) = value; + } + string* mutable_parameter_names(int index) { + return ¶meter_names_.at(index); + } + void add_parameter_names(const string& value) { + parameter_names_.push_back(value); + } + string* add_parameter_names() { + parameter_names_.push_back(""); + return ¶meter_names_.back(); + } + void clear_parameter_names() { parameter_names_.clear(); } + const std::vector& parameter_names() const { + return parameter_names_; + } + std::vector* mutable_parameter_names() { return ¶meter_names_; } + + string ShortDebugString() const { return ToProto().ShortDebugString(); } + string DebugString() const { return ToProto().DebugString(); } + + private: + // The shapes of the parameters of the computation represented by this object. + std::vector parameters_; + + // The names of the parameters of the computation represented by this object. + std::vector parameter_names_; + + // The shape of the result of the computation represented by this object. + Shape result_; +}; + +std::ostream& operator<<(std::ostream& out, const Shape& shape); +std::ostream& operator<<(std::ostream& out, const ProgramShape& program_shape); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SHAPE_H_ diff --git a/tensorflow/compiler/xla/shape_test.cc b/tensorflow/compiler/xla/shape_test.cc new file mode 100644 index 00000000000..e396897eeeb --- /dev/null +++ b/tensorflow/compiler/xla/shape_test.cc @@ -0,0 +1,149 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/shape.h" + +#include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace { + +class ShapeTest : public ::testing::Test { + protected: + const Shape opaque_ = ShapeUtil::MakeOpaqueShape(); + const Shape token_ = ShapeUtil::MakeTokenShape(); + const Shape scalar_ = ShapeUtil::MakeShape(F32, {}); + const Shape matrix_ = ShapeUtil::MakeShape(U32, {1, 2}); + const Shape matrix2_ = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1}); + const Shape tuple_ = + ShapeUtil::MakeTupleShape({opaque_, scalar_, matrix_, matrix2_}); + const Shape nested_tuple_ = + ShapeUtil::MakeTupleShape({tuple_, matrix_, token_}); +}; + +TEST_F(ShapeTest, ShapeToFromProto) { + for (const Shape& shape : + {opaque_, token_, scalar_, matrix_, matrix2_, tuple_, nested_tuple_}) { + Shape shape_copy(shape.ToProto()); + EXPECT_TRUE(ShapeUtil::Equal(shape, shape_copy)) + << shape << " != " << shape_copy; + } +} + +TEST_F(ShapeTest, ShapeToString) { + EXPECT_EQ("opaque[]", opaque_.ToString()); + EXPECT_EQ("token[]", token_.ToString()); + EXPECT_EQ("f32[]", scalar_.ToString()); + EXPECT_EQ("u32[1,2]", matrix_.ToString()); + EXPECT_EQ("s32[3,4]", matrix2_.ToString()); + EXPECT_EQ("(opaque[], f32[], u32[1,2], s32[3,4])", tuple_.ToString()); + EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", + nested_tuple_.ToString()); + + EXPECT_EQ("opaque[]", opaque_.ToString(/*print_layout=*/true)); + EXPECT_EQ("f32[]", scalar_.ToString(/*print_layout=*/true)); + EXPECT_EQ("u32[1,2]{1,0}", matrix_.ToString(/*print_layout=*/true)); + EXPECT_EQ("s32[3,4]{0,1}", matrix2_.ToString(/*print_layout=*/true)); + EXPECT_EQ("(opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1})", + tuple_.ToString(/*print_layout=*/true)); + EXPECT_EQ( + "((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0}, " + "token[])", + nested_tuple_.ToString(/*print_layout=*/true)); +} + +TEST_F(ShapeTest, ProgramShapeToFromProto) { + ProgramShape program_shape; + *program_shape.add_parameters() = ShapeUtil::MakeShape(F32, {1, 2, 3}); + *program_shape.add_parameters() = ShapeUtil::MakeTokenShape(); + *program_shape.add_parameters() = ShapeUtil::MakeShape(S64, {}); + *program_shape.add_parameters() = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(S32, {}), + ShapeUtil::MakeTupleShape({ShapeUtil::MakeTokenShape()}), + ShapeUtil::MakeShape(F32, {42, 42})}); + + *program_shape.mutable_result() = ShapeUtil::MakeShape(F32, {7}); + + program_shape.add_parameter_names("foo"); + program_shape.add_parameter_names("bar"); + program_shape.add_parameter_names("baz"); + program_shape.add_parameter_names("qux qux"); + + // Create a copy of the program shape by round-tripping through a proto. 
+ ProgramShape program_shape_copy(program_shape.ToProto()); + ASSERT_EQ(program_shape.parameters_size(), + program_shape_copy.parameters_size()); + for (int i = 0; i < program_shape.parameters_size(); ++i) { + EXPECT_TRUE(ShapeUtil::Equal(program_shape.parameters(i), + program_shape_copy.parameters(i))); + } + + EXPECT_TRUE( + ShapeUtil::Equal(program_shape.result(), program_shape_copy.result())); + + ASSERT_EQ(program_shape.parameter_names_size(), + program_shape_copy.parameter_names_size()); + for (int i = 0; i < program_shape.parameter_names_size(); ++i) { + EXPECT_EQ(program_shape.parameter_names(i), + program_shape_copy.parameter_names(i)); + } +} + +TEST_F(ShapeTest, ProgramShapeToString) { + ProgramShape prog = ShapeUtil::MakeProgramShape( + {opaque_, scalar_, matrix_, matrix2_, tuple_, nested_tuple_}, + nested_tuple_); + EXPECT_EQ( + "((unknown): opaque[], " + "(unknown): f32[], " + "(unknown): u32[1,2], " + "(unknown): s32[3,4], " + "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), " + "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) " + "-> " + "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", + prog.ToString()); + + prog.add_parameter_names("arg0"); + prog.add_parameter_names("scalar"); + prog.add_parameter_names("matrix"); + prog.add_parameter_names("matrix2"); + prog.add_parameter_names("tuple"); + prog.add_parameter_names("nested_tuple"); + EXPECT_EQ( + "(arg0: opaque[], " + "scalar: f32[], " + "matrix: u32[1,2], " + "matrix2: s32[3,4], " + "tuple: (opaque[], f32[], u32[1,2], s32[3,4]), " + "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], " + "token[])) " + "-> " + "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", + prog.ToString()); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h index df610102b4c..7bf97729165 100644 --- a/tensorflow/compiler/xla/shape_tree.h +++ b/tensorflow/compiler/xla/shape_tree.h @@ -667,12 +667,11 @@ void ShapeTree::CopySubtreeFrom(const ShapeTree& other, template bool ShapeTree::operator==(const ShapeTree& other) const { bool equal = true; - ForEachElement( - [this, &other, &equal](const ShapeIndex& index, const T& data) { - if (data != other.element(index)) { - equal = false; - } - }); + ForEachElement([&other, &equal](const ShapeIndex& index, const T& data) { + if (data != other.element(index)) { + equal = false; + } + }); return equal; } diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc index c8ff55e7845..2b6c484bc4f 100644 --- a/tensorflow/compiler/xla/shape_tree_test.cc +++ b/tensorflow/compiler/xla/shape_tree_test.cc @@ -52,10 +52,10 @@ class ShapeTreeTest : public ::testing::Test { TEST_F(ShapeTreeTest, DefaultConstructor) { ShapeTree int_tree; - EXPECT_TRUE(ShapeUtil::IsNil(int_tree.shape())); + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(int_tree.shape())); ShapeTree bool_tree; - EXPECT_TRUE(ShapeUtil::IsNil(bool_tree.shape())); + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(bool_tree.shape())); } void ShapeTreeTest::TestShapeConstructor(const Shape& shape, diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index d0c35d8dee4..f3cc51ca915 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -79,14 +79,14 @@ bool ShapeIndexView::StartsWith(ShapeIndexView prefix) const { indices_.subspan(0, prefix.size()) == prefix.indices_; } -namespace { - -// Returns whether the given primitive type 
corresponds to an array shape. -bool IsArrayPrimitiveType(PrimitiveType primitive_type) { +/* static */ bool ShapeUtil::IsArrayPrimitiveType( + PrimitiveType primitive_type) { return primitive_type != PRIMITIVE_TYPE_INVALID && primitive_type != TUPLE && primitive_type != OPAQUE && primitive_type != TOKEN; } +namespace { + // Recursive helper for comparing the equality of two shapes. Returns true if // the shapes are the same. If compare_layouts is true, then layouts must also // match. @@ -121,6 +121,23 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts, VLOG(3) << "CompareShapes: lhs layout != rhs layout"; return false; } + + const auto& lhs_tiles = lhs.layout().tiles(); + const auto& rhs_tiles = rhs.layout().tiles(); + if (lhs_tiles.size() != rhs_tiles.size()) { + return false; + } + for (int64 i = 0; i < lhs_tiles.size(); i++) { + if (!absl::c_equal(lhs_tiles[i].dimensions(), + rhs_tiles[i].dimensions())) { + return false; + } + } + + if (lhs.layout().element_size_in_bits() != + rhs.layout().element_size_in_bits()) { + return false; + } } } @@ -203,7 +220,7 @@ StatusOr MakeShapeWithLayoutInternal( /* static */ ProgramShape ShapeUtil::MakeProgramShape( std::initializer_list parameters, Shape result) { ProgramShape program_shape; - for (const auto& shape : parameters) { + for (const Shape& shape : parameters) { *program_shape.add_parameters() = shape; } *program_shape.mutable_result() = std::move(result); @@ -272,7 +289,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( /* static */ Shape ShapeUtil::MakeTupleShape(absl::Span shapes) { Shape result; result.set_element_type(TUPLE); - result.mutable_tuple_shapes()->Reserve(shapes.size()); + result.mutable_tuple_shapes()->reserve(shapes.size()); for (const auto& shape : shapes) { AppendShapeToTuple(shape, &result); } @@ -372,10 +389,6 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return IsTuple(shape) && TupleElementCount(shape) == 0; } -/* static */ bool ShapeUtil::IsNil(const Shape& shape) { - return IsEmptyTuple(shape); -} - /* static */ int64 ShapeUtil::TupleElementCount(const Shape& shape) { CHECK(IsTuple(shape)) << HumanString(shape); return shape.tuple_shapes_size(); @@ -1155,7 +1168,7 @@ Status ForEachMutableSubshapeHelper( // Let the argument `permutation` be P. This is a permutation over `shape`'s // dimensions, so our return value will be a shape with dims P.I = P. Our // goal is to construct a layout permutation L* that we can apply to P such - // that that the physical dimension ordering of the returned shape is the same + // that the physical dimension ordering of the returned shape is the same // as that of the original shape, namely L'. 
// // Our returned shape has dims P and layout L*, so its in-memory layout is @@ -1600,7 +1613,8 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete, Shape shape) { CHECK(IsArray(shape)); - shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete); + shape.mutable_dimensions()->erase(shape.mutable_dimensions()->begin() + + dim_to_delete); if (LayoutUtil::HasLayout(shape)) { Layout* layout = shape.mutable_layout(); layout->set_format(DENSE); @@ -1634,11 +1648,6 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, return shape; } -std::ostream& operator<<(std::ostream& out, const Shape& shape) { - out << ShapeUtil::HumanStringWithLayout(shape); - return out; -} - /*static*/ size_t ShapeUtil::Hash(const Shape& shape) { using tensorflow::hash; using tensorflow::Hash64Combine; diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index a7a3026cf3f..84a27f662a5 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -28,6 +28,7 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -37,6 +38,7 @@ limitations under the License. #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -100,6 +102,11 @@ class ShapeIndex { string ToString() const; + template + friend H AbslHashValue(H h, const ShapeIndex& index) { + return H::combine(std::move(h), index.indices_); + } + private: container_type indices_; }; @@ -461,6 +468,9 @@ class ShapeUtil { // arrays. static bool IsArray(const Shape& shape); + // Returns whether the given primitive type corresponds to an array shape. + static bool IsArrayPrimitiveType(PrimitiveType primitive_type); + // Returns whether the shape is a tuple with at least one element which is // also a tuple. static bool IsNestedTuple(const Shape& shape); @@ -468,9 +478,6 @@ class ShapeUtil { // Returns true if shape is an empty tuple. static bool IsEmptyTuple(const Shape& shape); - // Returns true if shape is the nil shape (an empty tuple). - static bool IsNil(const Shape& shape); - // Returns the number of elements in the given tuple shape. // Precondition: IsTuple(shape) static int64 TupleElementCount(const Shape& shape); @@ -754,10 +761,18 @@ class ShapeUtil { pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads); } + tensorflow::mutex mu; + Status status; // Guarded by mu + while (n < rank) { if (pool != absl::nullopt) { - pool->Schedule( - [indexes, &visitor_function] { visitor_function(indexes); }); + pool->Schedule([indexes, &visitor_function, &mu, &status] { + StatusOr result = visitor_function(indexes); + if (!result.ok()) { + tensorflow::mutex_lock lock(mu); + status = status.ok() ? result.status() : status; + } + }); } else { TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes)); if (!should_continue) { @@ -775,14 +790,14 @@ class ShapeUtil { } } - return Status::OK(); + // Waits for the scheduled work to complete. 
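The ForEachParallel hunk above now records the first visitor error under a mutex and waits for outstanding work by resetting the thread pool before returning. A minimal sketch of that error-propagation pattern using plain std:: primitives rather than the TensorFlow thread pool and Status types (the `Status` struct and function name below are illustrative, not taken from the diff):

```c++
#include <functional>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

// Stand-in for a Status type: ok flag plus an error message.
struct Status {
  bool ok = true;
  std::string message;
};

// Runs all tasks concurrently, keeps only the first failure (guarded by a
// mutex, as in the hunk), and joins the workers before returning -- the role
// played by `pool.reset()` above.
Status RunAllAndKeepFirstError(
    const std::vector<std::function<Status()>>& tasks) {
  std::mutex mu;
  Status status;  // Guarded by mu.
  std::vector<std::thread> workers;
  workers.reserve(tasks.size());
  for (const auto& task : tasks) {
    workers.emplace_back([&task, &mu, &status] {
      Status result = task();
      if (!result.ok) {
        std::lock_guard<std::mutex> lock(mu);
        if (status.ok) status = result;  // Later errors are dropped.
      }
    });
  }
  for (std::thread& worker : workers) worker.join();
  return status;
}
```

As in the diff, once an error is recorded, subsequent failures are ignored; callers learn only that some scheduled index failed.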
+ pool.reset(); + return status; } TF_DISALLOW_COPY_AND_ASSIGN(ShapeUtil); }; -std::ostream& operator<<(std::ostream& out, const Shape& shape); - } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SHAPE_UTIL_H_ diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc index 0c647369a37..60bdbe30204 100644 --- a/tensorflow/compiler/xla/shape_util_test.cc +++ b/tensorflow/compiler/xla/shape_util_test.cc @@ -376,12 +376,12 @@ TEST(ShapeUtilTest, ByteSizeOfWithoutPadding) { } TEST(ShapeUtilTest, NilShape) { - EXPECT_TRUE(ShapeUtil::IsNil(ShapeUtil::MakeNil())); - EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {1, 2, 3}))); - EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {0, 1}))); - EXPECT_FALSE(ShapeUtil::IsNil( + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeNil())); + EXPECT_FALSE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeShape(F32, {1, 2, 3}))); + EXPECT_FALSE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeShape(F32, {0, 1}))); + EXPECT_FALSE(ShapeUtil::IsEmptyTuple( ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {})}))); - EXPECT_FALSE(ShapeUtil::IsNil( + EXPECT_FALSE(ShapeUtil::IsEmptyTuple( ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {0})}))); } @@ -546,68 +546,6 @@ TEST(ShapeUtilTest, IsLeafIndex) { EXPECT_TRUE(ShapeUtil::IsLeafIndex(nested_tuple_shape, {1, 1})); } -TEST(ShapeUtilTest, HumanString) { - Shape opaque = ShapeUtil::MakeOpaqueShape(); - Shape token = ShapeUtil::MakeTokenShape(); - Shape scalar = ShapeUtil::MakeShape(F32, {}); - Shape matrix = ShapeUtil::MakeShape(U32, {1, 2}); - Shape matrix2 = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1}); - Shape tuple = ShapeUtil::MakeTupleShape({opaque, scalar, matrix, matrix2}); - Shape nested_tuple = ShapeUtil::MakeTupleShape({tuple, matrix, token}); - - EXPECT_EQ("opaque[]", ShapeUtil::HumanString(opaque)); - EXPECT_EQ("token[]", ShapeUtil::HumanString(token)); - EXPECT_EQ("f32[]", ShapeUtil::HumanString(scalar)); - EXPECT_EQ("u32[1,2]", ShapeUtil::HumanString(matrix)); - EXPECT_EQ("s32[3,4]", ShapeUtil::HumanString(matrix2)); - EXPECT_EQ("(opaque[], f32[], u32[1,2], s32[3,4])", - ShapeUtil::HumanString(tuple)); - EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", - ShapeUtil::HumanString(nested_tuple)); - - EXPECT_EQ("opaque[]", ShapeUtil::HumanStringWithLayout(opaque)); - EXPECT_EQ("f32[]", ShapeUtil::HumanStringWithLayout(scalar)); - EXPECT_EQ("u32[1,2]{1,0}", ShapeUtil::HumanStringWithLayout(matrix)); - EXPECT_EQ("s32[3,4]{0,1}", ShapeUtil::HumanStringWithLayout(matrix2)); - EXPECT_EQ("(opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1})", - ShapeUtil::HumanStringWithLayout(tuple)); - EXPECT_EQ( - "((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0}, " - "token[])", - ShapeUtil::HumanStringWithLayout(nested_tuple)); - - ProgramShape prog = ShapeUtil::MakeProgramShape( - {opaque, scalar, matrix, matrix2, tuple, nested_tuple}, nested_tuple); - EXPECT_EQ( - "((unknown): opaque[], " - "(unknown): f32[], " - "(unknown): u32[1,2], " - "(unknown): s32[3,4], " - "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), " - "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) " - "-> " - "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", - ShapeUtil::HumanString(prog)); - - prog.add_parameter_names("arg0"); - prog.add_parameter_names("scalar"); - prog.add_parameter_names("matrix"); - prog.add_parameter_names("matrix2"); - prog.add_parameter_names("tuple"); - prog.add_parameter_names("nested_tuple"); - 
EXPECT_EQ( - "(arg0: opaque[], " - "scalar: f32[], " - "matrix: u32[1,2], " - "matrix2: s32[3,4], " - "tuple: (opaque[], f32[], u32[1,2], s32[3,4]), " - "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], " - "token[])) " - "-> " - "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", - ShapeUtil::HumanString(prog)); -} - TEST(ShapeUtilTest, ForEachSubshapeArray) { const Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); int calls = 0; diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index db34d34f969..f7f090fe4ab 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -79,6 +79,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:lib", + "@com_google_absl//absl/base", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:span", ], @@ -135,6 +136,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", @@ -297,6 +299,56 @@ xla_test( ], ) +xla_test( + name = "conv_depthwise_test", + timeout = "long", + srcs = ["conv_depthwise_test.cc"], + blacklisted_backends = [ + # disabled because of a break b/119590850. + "gpu", + ], + shard_count = 50, + deps = [ + "//tensorflow/compiler/xla:execution_options_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/service:bfloat16_normalization", + "//tensorflow/compiler/xla/service:despecializer", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "@com_google_absl//absl/types:optional", + ], +) + +xla_test( + name = "grouped_convolution_test", + timeout = "long", + srcs = ["grouped_convolution_test.cc"], + blacklisted_backends = [ + # disabled because of a break b/119590850. + "gpu", + # disabled because it times out. 
+ "cpu", + ], + shard_count = 50, + deps = [ + "//tensorflow/compiler/xla:execution_options_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/service:bfloat16_normalization", + "//tensorflow/compiler/xla/service:despecializer", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "@com_google_absl//absl/types:optional", + ], +) + xla_test( name = "check_execution_arity_test", srcs = ["check_execution_arity_test.cc"], @@ -1265,6 +1317,7 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1865,6 +1918,7 @@ xla_test( xla_test( name = "multioutput_fusion_test", srcs = ["multioutput_fusion_test.cc"], + backends = ["gpu"], deps = [ "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index 2180b22cb3b..f6be27bee27 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -350,6 +350,44 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) { error_spec_); } +// TODO(b/119692968): This test runs OOM on the GPU and CPU backend. +XLA_TEST_F(ArrayElementwiseOpTest, + DISABLED_ON_GPU(DISABLED_ON_CPU(DeeplyNestedAddWithSlices))) { + XlaBuilder builder(TestName()); + std::vector values(30, 0.0); + auto a_literal = LiteralUtil::CreateR1(values); + auto a = Parameter(&builder, 0, a_literal.shape(), "x"); + auto b_literal = LiteralUtil::CreateR1(values); + auto b = Parameter(&builder, 1, b_literal.shape(), "x"); + + // Construct a sequence of diamond-shaped gadgets like this: + // + // add + // / \ + // slice slice + // \ / + // add + // + // Each 'left' slice removes the last element, each 'right' slice removes the + // first element. In this way, we index into the add with different + // multi-dimensional index arrays, which defeats the caching we use to avoid + // exponential compile time. 
+ std::function generate_recursive = + [&](int64 slice_size) -> XlaOp { + if (slice_size == values.size()) { + return Add(a, b); + } + XlaOp param = generate_recursive(slice_size + 1); + auto slice1 = Slice(param, {0}, {slice_size}, {1}); + auto slice2 = Slice(param, {1}, {slice_size + 1}, {1}); + return Add(slice1, slice2); + }; + generate_recursive(1); + auto a_data = client_->TransferToServer(a_literal).ConsumeValueOrDie(); + auto b_data = client_->TransferToServer(b_literal).ConsumeValueOrDie(); + ComputeAndCompareR1(&builder, {0.0}, {a_data.get(), b_data.get()}); +} + XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) { XlaBuilder builder(TestName()); auto a = ConstantR1(&builder, {-2.5f, 3.14f, 2.25f, -10.0f, 6.0f}); @@ -2744,12 +2782,16 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) { Array3D expected_3d( {{{0, 1}, {0, 0}, {0, 0}}, {{0, 1}, {1, 0}, {0, 1}}}); const string expected = R"(pred[2,3,2] { -{ { 0, 1 }, +{ + { 0, 1 }, { 0, 0 }, - { 0, 0 } }, -{ { 0, 1 }, + { 0, 0 } +}, +{ + { 0, 1 }, { 1, 0 }, - { 0, 1 } } + { 0, 1 } +} })"; EXPECT_EQ(expected, ExecuteToString(&builder, {})); } diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc index dde19fb65d6..702fb32adfc 100644 --- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc +++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc @@ -161,8 +161,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) { XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsUsual) { XlaBuilder b(TestName()); - BroadcastInDim(ConstantR1(&b, {1, 2}), - ShapeUtil::MakeShape(F32, {2, 2}), {1}); + BroadcastInDim(ConstantR1(&b, {1, 2}), {2, 2}, {1}); Array2D expected(2, 2); expected(0, 0) = 1; @@ -175,8 +174,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsUsual) { XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsTranspose) { XlaBuilder b(TestName()); - BroadcastInDim(ConstantR1(&b, {1, 2}), - ShapeUtil::MakeShape(F32, {2, 2}), {0}); + BroadcastInDim(ConstantR1(&b, {1, 2}), {2, 2}, {0}); Array2D expected(2, 2); expected(0, 0) = 1; @@ -189,8 +187,8 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsTranspose) { XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDims) { XlaBuilder b(TestName()); - BroadcastInDim(ConstantR2(&b, {{1.0, 5.0}, {2.0, 6.0}}), - ShapeUtil::MakeShape(F32, {2, 2, 2}), {0, 1}); + BroadcastInDim(ConstantR2(&b, {{1.0, 5.0}, {2.0, 6.0}}), {2, 2, 2}, + {0, 1}); Array3D expected(2, 2, 2); expected(0, 0, 0) = 1.0; @@ -207,8 +205,8 @@ XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDims) { XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDimsNotPossibleWithBroadCast) { XlaBuilder b(TestName()); - BroadcastInDim(ConstantR2(&b, {{1.0, 5.0}, {2.0, 6.0}}), - ShapeUtil::MakeShape(F32, {2, 2, 2}), {0, 2}); + BroadcastInDim(ConstantR2(&b, {{1.0, 5.0}, {2.0, 6.0}}), {2, 2, 2}, + {0, 2}); Array3D expected(2, 2, 2); expected(0, 0, 0) = 1.0; @@ -225,8 +223,7 @@ XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDimsNotPossibleWithBroadCast) { XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsNotPossibleWithBroadCast) { XlaBuilder b(TestName()); - BroadcastInDim(ConstantR1(&b, {1, 2}), - ShapeUtil::MakeShape(F32, {3, 2}), {1}); + BroadcastInDim(ConstantR1(&b, {1, 2}), {3, 2}, {1}); Array2D expected(3, 2); expected(0, 0) = 1; diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc index b98572e24c8..12c02998333 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.cc +++ 
b/tensorflow/compiler/xla/tests/client_library_test_base.cc @@ -107,7 +107,7 @@ StatusOr ClientLibraryTestBase::ExecuteAndTransfer( ExecutionOptions execution_options = execution_options_; if (shape_with_output_layout != nullptr) { *execution_options.mutable_shape_with_output_layout() = - *shape_with_output_layout; + shape_with_output_layout->ToProto(); } return client_->ExecuteAndTransfer(computation, arguments, &execution_options); @@ -127,7 +127,7 @@ StatusOr ClientLibraryTestBase::ExecuteAndTransferReference( ExecutionOptions execution_options = execution_options_; if (shape_with_output_layout != nullptr) { *execution_options.mutable_shape_with_output_layout() = - *shape_with_output_layout; + shape_with_output_layout->ToProto(); } execution_options.clear_device_handles(); return ref_client_->ExecuteAndTransfer(computation, arguments, diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 34148e5886d..65a23dd8835 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -76,7 +76,7 @@ class ClientLibraryTestBase : public ::testing::Test { void SetFastMathDisabled(bool disabled) { auto* opts = execution_options_.mutable_debug_options(); opts->set_xla_cpu_enable_fast_math(!disabled); - opts->set_xla_gpu_enable_fast_math(!disabled); + opts->set_xla_gpu_enable_fast_min_max(!disabled); } void SetSeed(uint64 seed) { execution_options_.set_seed(seed); } diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc index 6f2ca84bb64..363dee74b27 100644 --- a/tensorflow/compiler/xla/tests/client_test.cc +++ b/tensorflow/compiler/xla/tests/client_test.cc @@ -50,7 +50,8 @@ XLA_TEST_F(ClientTest, ExecuteWithLayout) { ExecutionOptions execution_options = execution_options_; *execution_options.mutable_shape_with_output_layout() = ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2}, - execute_layout); + execute_layout) + .ToProto(); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr data, client_->Execute(computation, {}, &execution_options)); @@ -84,7 +85,8 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) { {ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2}, /*minor_to_major=*/{0, 1}), ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2}, - /*minor_to_major=*/{1, 0})}); + /*minor_to_major=*/{1, 0})}) + .ToProto(); TF_ASSERT_OK_AND_ASSIGN( auto result, diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc index 9811a015e91..4f5b525a342 100644 --- a/tensorflow/compiler/xla/tests/concat_test.cc +++ b/tensorflow/compiler/xla/tests/concat_test.cc @@ -492,6 +492,32 @@ XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) { ComputeAndCompareR3(&builder, expected, {p0.get(), p1.get()}); } +XLA_TEST_F(ConcatTest, ConcatDeeplyNested) { + XlaBuilder builder(TestName()); + auto a_literal = LiteralUtil::CreateR1({256.0}); + auto a = Parameter(&builder, 0, a_literal.shape(), "x"); + auto b = ConcatInDim(&builder, {a, a}, 0); + auto c = ConcatInDim(&builder, {b, b}, 0); + auto d = ConcatInDim(&builder, {c, c}, 0); + auto e = ConcatInDim(&builder, {d, d}, 0); + auto f = ConcatInDim(&builder, {e, e}, 0); + auto g = ConcatInDim(&builder, {f, f}, 0); + auto h = ConcatInDim(&builder, {g, g}, 0); + auto i = ConcatInDim(&builder, {h, h}, 0); + auto j = ConcatInDim(&builder, {i, i}, 0); + auto k = ConcatInDim(&builder, {j, j}, 0); + auto l = ConcatInDim(&builder, {k, k}, 0); + 
auto m = ConcatInDim(&builder, {l, l}, 0); + auto n = ConcatInDim(&builder, {m, m}, 0); + auto o = ConcatInDim(&builder, {n, n}, 0); + auto p = ConcatInDim(&builder, {o, o}, 0); + auto q = ConcatInDim(&builder, {p, p}, 0); + ConcatInDim(&builder, {q, q}, 0); + std::vector expected(131072, 256.0); + auto a_data = client_->TransferToServer(a_literal).ConsumeValueOrDie(); + ComputeAndCompareR1(&builder, expected, {a_data.get()}); +} + // Describes a binary rank-2 concatenation test. struct R2BinarySpec { int64 lhs_dim0; diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc new file mode 100644 index 00000000000..bc9bd8a2691 --- /dev/null +++ b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc @@ -0,0 +1,234 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/bfloat16_normalization.h" +#include "tensorflow/compiler/xla/service/despecializer.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +namespace { + +string GetFloatDataType(bool use_bfloat16) { + return use_bfloat16 ? "bf16" : "f32"; +} + +struct DepthwiseConvolution2DSpec { + int64 output_feature, window, stride, pad, lhs_dilate; + std::vector activation_dims; + std::vector activation_layout; + std::vector kernel_dims; + std::vector kernel_layout; + std::vector output_dims; + std::vector output_layout; +}; + +class DepthwiseConvolution2DTest + : public HloTestBase, + public ::testing::WithParamInterface< + ::testing::tuple> {}; + +static std::vector GetConv2DTestCases() { + std::vector config_set; + std::vector> config_options = { + {128, 6, 3, 64}, {256, 5, 3, 256}, {256, 5, 2, 144}, {144, 5, 3, 64}, + {144, 5, 2, 256}, {8, 48, 17, 8}, {128, 20, 6, 64}, {128, 1, 2, 144}, + {256, 1, 2, 64}, {64, 14, 12, 172}, {16, 9, 4, 16}}; + + for (auto option : config_options) { + int64 feature = option[0]; + int64 activation_size = option[1]; + int64 kernel_size = option[2]; + int64 batch = option[3]; + + std::vector kernel_layout = {3, 2, 1, 0}; + DepthwiseConvolution2DSpec config; + config.output_feature = feature; + config.window = kernel_size; + + config.activation_dims = {batch, activation_size, activation_size, feature}; + config.activation_layout = {3, 0, 2, 1}; + + config.kernel_dims = {kernel_size, kernel_size, 1, feature}; + config.kernel_layout = {3, 2, 1, 0}; + + if (activation_size == 1 && kernel_size == 2) { + // Test for outer dim. 
+ config.output_dims = {batch, activation_size + kernel_size - 1, + activation_size + kernel_size, feature}; + } else if (feature == 256) { + // Restrict dilation-based tests only to one feature configuration. + config.stride = activation_size - 1; + config.pad = 0; + config.lhs_dilate = feature / 32; + config.output_dims = {batch, feature / 32, + activation_size - kernel_size + 1, feature}; + } else { + config.stride = config.pad = config.lhs_dilate = -1; + config.output_dims = {batch, activation_size - kernel_size + 1, + activation_size - kernel_size + 1, feature}; + } + + // Try this layout for all kernel shapes. + config.output_layout = {3, 0, 2, 1}; + config_set.push_back(config); + + // Try other layouts only for certain kernel shapes. + if (kernel_size % 2 == 0) { + config.activation_layout = {0, 3, 2, 1}; + config_set.push_back(config); + + config.output_layout = {0, 3, 2, 1}; + config_set.push_back(config); + + config.activation_layout = {3, 0, 2, 1}; + config_set.push_back(config); + } + } + + return config_set; +} + +string DepthwiseConvolution2DTestDataToString( + const ::testing::TestParamInfo< + ::testing::tuple>& data) { + const auto& spec = ::testing::get<0>(data.param); + const string data_type = GetFloatDataType(::testing::get<1>(data.param)); + string str = absl::StrCat( + "activation_dims_", absl::StrJoin(spec.activation_dims, "x"), + "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"), + "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_", + absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_", + absl::StrJoin(spec.output_dims, "x"), "_output_layout_", + absl::StrJoin(spec.output_layout, "_"), data_type); + // -1 indicates non-existence. + if (spec.stride != -1) { + absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1"); + } + + // Test names are not allowed to contain the '-' character. 
+ absl::c_replace(str, '-', 'n'); + return str; +} + +string BuildHloTextDepthwiseConvolution2D( + const DepthwiseConvolution2DSpec& spec, bool use_bfloat16) { + const string data_type = GetFloatDataType(use_bfloat16); + if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) { + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv + + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f, + feature_group_count=%d + } + )", + data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.window, spec.window, spec.window, spec.output_feature); + + } else if (spec.stride == -1) { + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv + + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d}, dim_labels=b01f_01io->b01f, + feature_group_count=%d + } + )", + data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.output_feature); + } else { + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv + + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, + dim_labels=b01f_01io->b01f, feature_group_count=%d + } + )", + data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.stride, 0, 0, spec.lhs_dilate, spec.output_feature); + } +} + +XLA_TEST_P(DepthwiseConvolution2DTest, DoIt) { + const DepthwiseConvolution2DSpec& spec = ::testing::get<0>(GetParam()); + bool use_bfloat16 = ::testing::get<1>(GetParam()); + const string hlo_text = + BuildHloTextDepthwiseConvolution2D(spec, use_bfloat16); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01}, + [](HloModule* module) -> Status { + BFloat16MixedPrecisionRemoval remover; + TF_RETURN_IF_ERROR(remover.Run(module).status()); + Despecializer despecializer; + return 
despecializer.Run(module).status(); + })); +} + +INSTANTIATE_TEST_CASE_P( + DepthwiseConvolution2DTestWithRandomIndices, DepthwiseConvolution2DTest, + ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()), + ::testing::Bool()), + DepthwiseConvolution2DTestDataToString); + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index 211d004ec8c..459add96813 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -721,8 +721,6 @@ class Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid : public ConvolutionTest { ComputeAndCompareLiteral(&builder, expected_r4, {input_literal.get(), filter_literal.get()}, error_spec_); - - auto filter_r = filter_r1.Reshape(filter_dims); } }; @@ -731,6 +729,291 @@ TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, Types) { this->RunTest(); } +template +class Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes + : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 512}; + std::vector filter_dims = {3, 3, 1, 512}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/512); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(2048, static_cast(18)); + + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 512}).ConsumeValueOrDie(); + auto expected_r4_relaid = + expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1})); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4_relaid, + {input_literal.get(), filter_literal.get()}, + error_spec_, &expected_r4_relaid.shape()); + } +}; + +TYPED_TEST_CASE( + Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes, + TestTypes); +TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes, + Types) { + this->RunTest(); +} + +template +class Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes + : public ConvolutionTest { + public: 
+ void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {256, 4, 4, 512}; + std::vector filter_dims = {3, 3, 1, 512}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/512); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + auto input_r4_relaid = + input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1})); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(2048 * 256, static_cast(18)); + + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = + expected_r1.Reshape({256, 2, 2, 512}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; + +TYPED_TEST_CASE(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes, + TestTypes); +TYPED_TEST(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes, + Types) { + this->RunTest(); +} + +template +class Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes + : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {256, 4, 4, 512}; + std::vector filter_dims = {3, 3, 1, 512}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
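Several of these depthwise tests build expected literals filled entirely with 18. A quick arithmetic check of that constant, assuming the usual VALID depthwise semantics in which each output element contracts a single input channel over the 3x3 window:

```c++
// All-ones input, all-twos 3x3 depthwise filter, VALID padding: every output
// element sums one channel over the 3x3 window, so 3 * 3 * (1 * 2) = 18,
// independent of batch size or feature count.
constexpr int kWindow = 3;
constexpr int kExpectedDepthwise = kWindow * kWindow * 1 * 2;
static_assert(kExpectedDepthwise == 18, "matches the output_elems fill value");
```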
+ ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/512); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + auto input_r4_relaid = + input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1})); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(2048 * 256, static_cast(18)); + + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = + expected_r1.Reshape({256, 2, 2, 512}).ConsumeValueOrDie(); + auto expected_r4_relaid = + expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1})); + + auto input_literal = + client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4_relaid, + {input_literal.get(), filter_literal.get()}, + error_spec_, &expected_r4_relaid.shape()); + } +}; + +TYPED_TEST_CASE(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes, + TestTypes); +TYPED_TEST(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes, + Types) { + this->RunTest(); +} + +template +class Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes + : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 5}; + std::vector filter_dims = {3, 3, 1, 5}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
+ ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/5); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape)); + iota_int_init_value(input_elems, 1); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + auto input_r4_relaid = + input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1})); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape)); + iota_int_init_value(filter_elems, 1); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + auto expected_r1 = LiteralUtil::CreateR1( + {static_cast(6864), static_cast(7296), static_cast(7746), + static_cast(8214), static_cast(8700), static_cast(7809), + static_cast(8286), static_cast(8781), static_cast(9294), + static_cast(9825), static_cast(10644), static_cast(11256), + static_cast(11886), static_cast(12534), static_cast(13200), + static_cast(11589), static_cast(12246), static_cast(12921), + static_cast(13614), static_cast(14325)}); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 5}).ConsumeValueOrDie(); + auto expected_r4_relaid = + expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1})); + + auto input_literal = + client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4_relaid, + {input_literal.get(), filter_literal.get()}, + error_spec_, &expected_r4_relaid.shape()); + } +}; + +TYPED_TEST_CASE( + Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes, + TestTypes); +TYPED_TEST(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes, + Types) { + this->RunTest(); +} + template class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid : public ConvolutionTest { public: @@ -786,8 +1069,6 @@ class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid : public ConvolutionTest { ComputeAndCompareLiteral(&builder, expected_r4, {input_literal.get(), filter_literal.get()}, error_spec_); - - auto filter_r = filter_r1.Reshape(filter_dims); } }; @@ -796,6 +1077,146 @@ TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, Types) { this->RunTest(); } +template +class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes + : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 160}; + std::vector filter_dims = {3, 3, 1, 160}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
+ ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/160); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + auto input_r4_relaid = + input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1})); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(640, static_cast(18)); + + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie(); + auto expected_r4_relaid = + expected_r4.Relayout(LayoutUtil::MakeLayout({3, 0, 2, 1})); + + auto input_literal = + client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4_relaid, + {input_literal.get(), filter_literal.get()}, + error_spec_, &expected_r4_relaid.shape()); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes, + TestTypes); +TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes, + Types) { + this->RunTest(); +} + +template +class Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes + : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 160}; + std::vector filter_dims = {3, 3, 1, 160}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
+ ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/160); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + auto input_r4_relaid = + input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1})); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(640, static_cast(18)); + + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie(); + auto expected_r4_relaid = + expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1})); + + auto input_literal = + client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4_relaid, + {input_literal.get(), filter_literal.get()}, + error_spec_, &expected_r4_relaid.shape()); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes, + TestTypes); +TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes, + Types) { + this->RunTest(); +} + template class Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid : public ConvolutionTest { @@ -852,8 +1273,6 @@ class Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid ComputeAndCompareLiteral(&builder, expected_r4, {input_literal.get(), filter_literal.get()}, error_spec_); - - auto filter_r = filter_r1.Reshape(filter_dims); } }; @@ -863,7 +1282,7 @@ TYPED_TEST(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, Types) { } template -class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest { +class Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid : public ConvolutionTest { public: void RunTest() { XlaBuilder builder(TestName()); @@ -922,8 +1341,329 @@ class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest { } }; -TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, TestTypes); -TYPED_TEST(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, Types) { +TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x2x2x1024_2x2x128x512_Grouped_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 2, 2, 1024}; + std::vector filter_dims = {2, 2, 128, 512}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers 
for 2D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/8); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(512, static_cast(1024)); + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = expected_r1.Reshape({1, 1, 1, 512}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x2x2x1024_2x2x128x512_Grouped_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x2x2x1024_2x2x128x512_Grouped_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x2x2x1024_2x2x128x8_Grouped_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 2, 2, 1024}; + std::vector filter_dims = {2, 2, 128, 8}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
+ ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/8); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(8, static_cast(1024)); + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = expected_r1.Reshape({1, 1, 1, 8}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x2x2x1024_2x2x128x8_Grouped_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x2x2x1024_2x2x128x8_Grouped_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 2, 2, 12}; + std::vector filter_dims = {2, 2, 3, 4}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
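The two 1x2x2x1024 grouped tests above fill their expected outputs with 1024. A quick check of that value, assuming each of the 8 groups contracts its 1024 / 8 = 128 input features over the 2x2 VALID window:

```c++
// All-ones input, all-twos 2x2 filter, 8 groups over 1024 input features:
// 2 * 2 spatial taps * 128 features per group * (1 * 2) = 1024 per output.
constexpr int kGroupedExpected = 2 * 2 * (1024 / 8) * (1 * 2);
static_assert(kGroupedExpected == 1024, "matches output_elems in both tests");
```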
+ ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/4); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape)); + iota_int_init_value(input_elems, 1); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape)); + iota_int_init_value(filter_elems, 1); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + auto expected_r1 = + LiteralUtil::CreateR1({static_cast(7712), static_cast(8816), + static_cast(9992), static_cast(11240)}); + auto expected_r4 = expected_r1.Reshape({1, 1, 1, 4}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid_Filter_OF_In_Sublanes + : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 2, 2, 12}; + std::vector filter_dims = {2, 2, 4, 3}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
+ ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(3); + dnums.set_kernel_output_feature_dimension(2); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/4); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape)); + iota_int_init_value(input_elems, 1); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape)); + iota_int_init_value(filter_elems, 1); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + auto filter_r4_relaid = + filter_r4.Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0})); + auto expected_r1 = LiteralUtil::CreateR1( + {static_cast(6968), static_cast(8516), static_cast(10280), + static_cast(12260)}); + auto expected_r4 = expected_r1.Reshape({1, 1, 1, 4}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4_relaid).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid_Filter_OF_In_Sublanes, + TestTypes); +TYPED_TEST(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid_Filter_OF_In_Sublanes, + Types) { + this->RunTest(); +} + +template +class Convolve2D_1x1x1x12_1x1x3x4_Grouped_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 1, 1, 12}; + std::vector filter_dims = {1, 1, 3, 4}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
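The 1x1x1x12 grouped test that this hunk continues expects the literal {38, 98, 176, 272}. A hand re-computation of those values, under the assumptions that group g contracts input features 3g..3g+2 against filter output column g, and that the iota initialization yields input[f] = f + 1 and row-major filter[{0,0,i,o}] = 4*i + o + 1:

```c++
#include <cassert>

void CheckGroupedIotaExpectation() {
  const int expected[4] = {38, 98, 176, 272};  // literal built by the test
  for (int g = 0; g < 4; ++g) {
    int sum = 0;
    for (int i = 0; i < 3; ++i) {
      const int input = 3 * g + i + 1;   // iota over the 12 input features
      const int weight = 4 * i + g + 1;  // iota over the {1,1,3,4} filter
      sum += input * weight;
    }
    assert(sum == expected[g]);
  }
}
```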
+ ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/4); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape)); + iota_int_init_value(input_elems, 1); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape)); + iota_int_init_value(filter_elems, 1); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + auto expected_r1 = + LiteralUtil::CreateR1({static_cast(38), static_cast(98), + static_cast(176), static_cast(272)}); + auto expected_r4 = expected_r1.Reshape({1, 1, 1, 4}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x1x1x12_1x1x3x4_Grouped_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x1x1x12_1x1x3x4_Grouped_Valid, Types) { this->RunTest(); } @@ -1217,6 +1957,18 @@ ENTRY Test { EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001})); } +XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64ForwardReversed)) { + constexpr char kHlo[] = R"( +HloModule TestModule + +ENTRY Test { + %arg0 = f64[3,56,56,16] parameter(0) + %arg1 = f64[3,3,3,64] parameter(1) + ROOT %conv = f64[54,54,16,64] convolution(%arg0, %arg1), window={size=3x3 rhs_reversal=1x1}, dim_labels=f01b_i01o->01bf +})"; + EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001})); +} + XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardFilter)) { constexpr char kHlo[] = R"( HloModule TestModule diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 6c0847a8757..25091b8d5d5 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -637,6 +637,76 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMul) { {x_data.get(), y_data.get()}, this->error_spec_); } +#ifndef XLA_TEST_BACKEND_CPU +// TODO(b/74459949): failed on CPU on 2018-10-29. 
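+// Batch dot with a rank-3 LHS and a rank-2 RHS: each batch of x is contracted
+// against the matching row of y. With the rows {1, 0} and {0, 1} used below,
+// batch 0 selects x[0, 0, :] = {1, 2} and batch 1 selects x[1, 1, :] = {7, 8}.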
+XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulR3LhsR2Rhs) { + using T = TypeParam; + + XlaBuilder builder(this->TestName()); + auto x = + Parameter(&builder, 0, ShapeUtil::MakeShapeWithType({2, 2, 2}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShapeWithType({2, 2}), "y"); + + DotDimensionNumbers dnums; + dnums.add_lhs_contracting_dimensions(1); + dnums.add_rhs_contracting_dimensions(1); + dnums.add_lhs_batch_dimensions(0); + dnums.add_rhs_batch_dimensions(0); + + DotGeneral(x, y, dnums); + + auto x_data = + this->client_ + ->TransferToServer(LiteralUtil::CreateR3FromArray3D( + {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}})) + .ConsumeValueOrDie(); + + auto y_data = this->client_ + ->TransferToServer(LiteralUtil::CreateR2FromArray2D( + {{1.0f, 0.0f}, {0.0f, 1.0f}})) + .ConsumeValueOrDie(); + + this->template ComputeAndCompareR2( + &builder, + /*expected=*/{{1.0f, 2.0f}, {7.0f, 8.0f}}, {x_data.get(), y_data.get()}, + this->error_spec_); +} + +// TODO(b/74459949): failed on CPU on 2018-10-29. +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulR2LhsR3Rhs) { + using T = TypeParam; + + XlaBuilder builder(this->TestName()); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShapeWithType({2, 2}), "x"); + auto y = + Parameter(&builder, 1, ShapeUtil::MakeShapeWithType({2, 2, 2}), "y"); + + DotDimensionNumbers dnums; + dnums.add_lhs_contracting_dimensions(1); + dnums.add_rhs_contracting_dimensions(1); + dnums.add_lhs_batch_dimensions(0); + dnums.add_rhs_batch_dimensions(0); + + DotGeneral(x, y, dnums); + + auto x_data = this->client_ + ->TransferToServer(LiteralUtil::CreateR2FromArray2D( + {{1.0f, 0.0f}, {0.0f, 1.0f}})) + .ConsumeValueOrDie(); + + auto y_data = + this->client_ + ->TransferToServer(LiteralUtil::CreateR3FromArray3D( + {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}})) + .ConsumeValueOrDie(); + + this->template ComputeAndCompareR2( + &builder, + /*expected=*/{{1.0f, 2.0f}, {7.0f, 8.0f}}, {x_data.get(), y_data.get()}, + this->error_spec_); +} +#endif // XLA_TEST_BACKEND_CPU + XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulMultipleBatch) { using T = TypeParam; diff --git a/tensorflow/compiler/xla/tests/grouped_convolution_test.cc b/tensorflow/compiler/xla/tests/grouped_convolution_test.cc new file mode 100644 index 00000000000..8f7049910e7 --- /dev/null +++ b/tensorflow/compiler/xla/tests/grouped_convolution_test.cc @@ -0,0 +1,245 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/bfloat16_normalization.h" +#include "tensorflow/compiler/xla/service/despecializer.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +namespace { + +string GetFloatDataType(bool use_bfloat16) { + return use_bfloat16 ? "bf16" : "f32"; +} + +struct GroupedConvolution2DSpec { + int64 input_feature, output_feature, window, stride, pad, lhs_dilate; + int64 group_size, group_count; + std::vector activation_dims; + std::vector activation_layout; + std::vector kernel_dims; + std::vector kernel_layout; + std::vector output_dims; + std::vector output_layout; +}; + +class GroupedConvolution2DTest + : public HloTestBase, + public ::testing::WithParamInterface< + ::testing::tuple> {}; + +static std::vector GetConv2DTestCases() { + std::vector config_set; + // Add to this set if you want a new test configuration. + // Rule : the penultimate number must be divisible by the last number. + std::vector> config_options = {{8, 2, 2, 1, 1024, 128}, + {512, 3, 3, 144, 1024, 16}, + {256, 3, 3, 129, 512, 64}, + {64, 1, 2, 127, 32, 8}, + {256, 3, 3, 256, 1024, 4}}; + + for (auto option : config_options) { + int64 output_feature = option[0]; + int64 activation_size = option[1]; + int64 kernel_size = option[2]; + int64 batch = option[3]; + int64 input_feature = option[4]; + int64 group_size = option[5]; + + std::vector kernel_layout = {3, 2, 1, 0}; + GroupedConvolution2DSpec config; + config.group_size = group_size; + config.group_count = input_feature / group_size; + config.output_feature = output_feature; + config.window = kernel_size; + + config.activation_dims = {batch, activation_size, activation_size, + input_feature}; + config.activation_layout = {3, 0, 2, 1}; + + config.kernel_dims = {kernel_size, kernel_size, group_size, output_feature}; + config.kernel_layout = {3, 2, 1, 0}; + + if (activation_size == 1 && kernel_size == 2) { + // Test for outer dim. + config.output_dims = {batch, activation_size + kernel_size - 1, + activation_size + kernel_size, output_feature}; + } else if (output_feature == 256) { + // Restrict dilation-based tests only to one feature configuration. + config.stride = activation_size - 1; + config.pad = 0; + config.lhs_dilate = output_feature / 32; + config.output_dims = {batch, output_feature / 32, + activation_size - kernel_size + 1, output_feature}; + } else { + config.stride = config.pad = config.lhs_dilate = -1; + config.output_dims = {batch, activation_size - kernel_size + 1, + activation_size - kernel_size + 1, output_feature}; + } + + // Try this layout for all kernel shapes. + config.output_layout = {3, 0, 2, 1}; + config_set.push_back(config); + + // Try other layouts only for certain kernel shapes. 
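+ // For even kernel sizes, also permute the activation and output layouts so
+ // the same configuration is exercised with non-default minor-to-major orders.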
+ if (kernel_size % 2 == 0) { + config.activation_layout = {0, 3, 2, 1}; + config_set.push_back(config); + + config.output_layout = {0, 3, 2, 1}; + config_set.push_back(config); + + config.activation_layout = {3, 0, 2, 1}; + config_set.push_back(config); + } + } + + return config_set; +} + +string GroupedConvolution2DTestDataToString( + const ::testing::TestParamInfo< + ::testing::tuple>& data) { + const auto& spec = ::testing::get<0>(data.param); + const string data_type = GetFloatDataType(::testing::get<1>(data.param)); + string str = absl::StrCat( + "activation_dims_", absl::StrJoin(spec.activation_dims, "x"), + "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"), + "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_", + absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_", + absl::StrJoin(spec.output_dims, "x"), "_output_layout_", + absl::StrJoin(spec.output_layout, "_"), data_type); + // -1 indicates non-existence. + if (spec.stride != -1) { + absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1"); + } + + // Test names are not allowed to contain the '-' character. + absl::c_replace(str, '-', 'n'); + return str; +} + +string BuildHloTextGroupedConvolution2D(const GroupedConvolution2DSpec& spec, + bool use_bfloat16) { + const string data_type = GetFloatDataType(use_bfloat16); + if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) { + // Check for outer dim. + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv + + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f, + feature_group_count=%d + } + )", + data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.window, spec.window, spec.window, spec.group_count); + + } else if (spec.stride == -1) { + // Check for basic, non-dilated cases. + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv + + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d}, dim_labels=b01f_01io->b01f, + feature_group_count=%d + } + )", + data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.group_count); + } else { + // Check for base dilations. 
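+ // This branch emits an explicit window with stride, pad and lhs_dilate
+ // attributes, matching the stride/pad/lhs_dilate fields populated for the
+ // dilation-based configurations above.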
+ return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv + + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, + dim_labels=b01f_01io->b01f, feature_group_count=%d + } + )", + data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.stride, 0, 0, spec.lhs_dilate, spec.group_count); + } +} + +XLA_TEST_P(GroupedConvolution2DTest, DoIt) { + const GroupedConvolution2DSpec& spec = ::testing::get<0>(GetParam()); + bool use_bfloat16 = ::testing::get<1>(GetParam()); + const string hlo_text = BuildHloTextGroupedConvolution2D(spec, use_bfloat16); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01}, + [](HloModule* module) -> Status { + BFloat16MixedPrecisionRemoval remover; + TF_RETURN_IF_ERROR(remover.Run(module).status()); + Despecializer despecializer; + return despecializer.Run(module).status(); + })); +} + +INSTANTIATE_TEST_CASE_P( + GroupedConvolution2DTestWithRandomIndices, GroupedConvolution2DTest, + ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()), + ::testing::Bool()), + GroupedConvolution2DTestDataToString); + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index d8fa00272f8..989a7c705a8 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -99,6 +99,8 @@ void VerifiedHloModule::VerifyOrAddFailure(const string& message) { ADD_FAILURE() << "HloVerifier failed on module " << name() << (message.empty() ? "" : absl::StrCat(" (", message, ")")) << ": " << status; + LOG(ERROR) << "Contents of bad module:"; + XLA_LOG_LINES(tensorflow::ERROR, ToString()); } } @@ -140,14 +142,6 @@ std::unique_ptr HloTestBase::CreateNewVerifiedModule( allow_mixed_precision_in_hlo_verifier_); } -StatusOr> -HloTestBase::ParseAndReturnUnverifiedModule(absl::string_view hlo_text, - const HloModuleConfig& config) { - auto module = absl::make_unique(TestName(), config); - TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get())); - return std::move(module); -} - StatusOr> HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text, const HloModuleConfig& config) { diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 366726d90b4..1d1e7f43729 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/base/macros.h" #include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/service/backend.h" @@ -100,6 +101,7 @@ class HloTestBase : public ::testing::Test { // // This returns a vanilla HloModule that doesn't run the HLO verifier on // destruction. 
+ ABSL_DEPRECATED("Use CreateNewVerifiedModule instead.") std::unique_ptr CreateNewUnverifiedModule( const string& name = TestName()); @@ -108,12 +110,6 @@ class HloTestBase : public ::testing::Test { std::unique_ptr CreateNewVerifiedModule( const string& name = TestName()); - // Parses the given string and returns module as a vanilla, unverified - // HloModule. - StatusOr> ParseAndReturnUnverifiedModule( - absl::string_view hlo_text, - const HloModuleConfig& config = HloModuleConfig()); - // Parses the given string and returns module as a VerifiedHloModule. StatusOr> ParseAndReturnVerifiedModule( absl::string_view hlo_text, diff --git a/tensorflow/compiler/xla/tests/iota_test.cc b/tensorflow/compiler/xla/tests/iota_test.cc index 310f3495922..65205f53ddc 100644 --- a/tensorflow/compiler/xla/tests/iota_test.cc +++ b/tensorflow/compiler/xla/tests/iota_test.cc @@ -113,5 +113,26 @@ INSTANTIATE_TEST_CASE_P(IotaR3TestInstantiation, IotaR3Test, /*step=*/10), ::testing::Values(0, 1, 2))); +class IotaR3PredTest : public ClientLibraryTestBase, + public ::testing::WithParamInterface {}; + +TEST_P(IotaR3PredTest, DoIt) { + const auto element_type = PRED; + const int64 num_elements = 2; + const int64 iota_dim = GetParam(); + XlaBuilder builder(TestName() + "_" + PrimitiveType_Name(element_type)); + std::vector dimensions = {42, 19}; + dimensions.insert(dimensions.begin() + iota_dim, num_elements); + Iota(&builder, ShapeUtil::MakeShape(element_type, dimensions), iota_dim); + if (primitive_util::IsFloatingPointType(element_type)) { + ComputeAndCompare(&builder, {}, ErrorSpec{0.0001}); + } else { + ComputeAndCompare(&builder, {}); + } +} + +INSTANTIATE_TEST_CASE_P(IotaR3PredTestInstantiation, IotaR3PredTest, + ::testing::Values(0, 1, 2)); + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc index 5cf87e565bf..34c7dc7c464 100644 --- a/tensorflow/compiler/xla/tests/replay_test.cc +++ b/tensorflow/compiler/xla/tests/replay_test.cc @@ -55,7 +55,8 @@ TEST_F(ReplayTest, TwoPlusTwoReplay) { client_->GetComputationShape(computation).ConsumeValueOrDie(); std::unique_ptr replayed_shape = client_->GetComputationShape(replayed).ConsumeValueOrDie(); - ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape)); + ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(), + replayed_shape->ToProto())); // Run it. Literal literal = @@ -87,7 +88,8 @@ XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) { client_->GetComputationShape(computation).ConsumeValueOrDie(); std::unique_ptr replayed_shape = client_->GetComputationShape(replayed).ConsumeValueOrDie(); - ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape)); + ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(), + replayed_shape->ToProto())); // Run it. std::unique_ptr x_data = @@ -133,7 +135,8 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) { client_->GetComputationShape(computation).ConsumeValueOrDie(); std::unique_ptr replayed_shape = client_->GetComputationShape(replayed).ConsumeValueOrDie(); - ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape)); + ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(), + replayed_shape->ToProto())); // Run it. 
Literal literal = diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc index dedc95b5ae8..298136002e9 100644 --- a/tensorflow/compiler/xla/tests/reshape_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_test.cc @@ -618,7 +618,8 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) { ExecutionOptions execution_options = execution_options_; *execution_options.mutable_shape_with_output_layout() = ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {2, 8}, - {1, 0}); + {1, 0}) + .ToProto(); Literal actual = client_ ->ExecuteAndTransfer(computation, {input.get()}, &execution_options) @@ -767,7 +768,8 @@ XLA_TEST_P(ReshapeTest, NoopReshape) { ExecutionOptions execution_options = execution_options_; *execution_options.mutable_shape_with_output_layout() = ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {7, 2, 3, 5}, - {2, 3, 0, 1}); + {2, 3, 0, 1}) + .ToProto(); Literal output_literal = client_ ->ExecuteAndTransfer(computation, {input_data.get()}, diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc index 7e1f4aa0eb4..32de0fdf78f 100644 --- a/tensorflow/compiler/xla/tests/scatter_test.cc +++ b/tensorflow/compiler/xla/tests/scatter_test.cc @@ -129,6 +129,42 @@ ENTRY main { RunTest(hlo_text, &operand, &scatter_indices, &updates); } +XLA_TEST_F(ScatterTest, TensorFlowScatterV2_InversePermutation) { + const char* hlo_text = R"( +HloModule TensorFlowScatterV2 + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + permutation = s32[3,4] parameter(0) + reshape = s32[3,4,1] reshape(permutation) + operand = s32[3,4] iota(), iota_dimension=1 + updates = s32[3,4,1,1] iota(), iota_dimension=1 + iota = s32[3,4,1] iota(), iota_dimension=0 + indices = s32[3,4,2] concatenate(iota, reshape), dimensions={2} + ROOT scatter = s32[3,4] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={2,3}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=2 +} +)"; + Literal permutation = + LiteralUtil::CreateR2({{1, 3, 2, 0}, {3, 0, 2, 1}, {2, 3, 1, 0}}); + HloModuleConfig config; + config.set_debug_options(GetDebugOptionsForTest()); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text, config)); + auto actual = ExecuteAndTransfer(std::move(module), {&permutation}); + Literal expected = + LiteralUtil::CreateR2({{3, 0, 2, 1}, {1, 3, 2, 0}, {3, 2, 0, 1}}); + EXPECT_TRUE(LiteralTestUtil::Equal(expected, actual)); +} + XLA_TEST_F(ScatterTest, SimpleR4) { const char* hlo_text = R"( HloModule SimpleR4 diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc index 2f18036ff4c..eafa48ed7b8 100644 --- a/tensorflow/compiler/xla/tests/test_utils.cc +++ b/tensorflow/compiler/xla/tests/test_utils.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include +#include "absl/base/casts.h" #include "absl/memory/memory.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/primitive_util.h" @@ -28,65 +29,113 @@ namespace xla { namespace { template -void PopulateWithRandomFloatingPointDataImpl(Literal* literal, - std::minstd_rand0* engine, - bool no_duplicates) { - CHECK(engine != nullptr); - CHECK_EQ(literal->shape().element_type(), - primitive_util::NativeToPrimitiveType()); - if (no_duplicates) { - // Duplicates may be generated if the number of elements in the literal - // exceeds the number of positive values supported by the type. - FloatT next_value = std::numeric_limits::min(); - for (FloatT& value : literal->data()) { - value = next_value; - next_value = - std::nextafter(next_value, std::numeric_limits::max()); - } - std::shuffle(literal->data().begin(), literal->data().end(), - *engine); - } else { - std::uniform_real_distribution generator(-0.1f, 0.2f); - for (FloatT& value : literal->data()) { - value = static_cast(generator(*engine)); - } +void PopulateWithRandomFloatingPointData(Literal* literal, + std::minstd_rand0* engine) { + std::uniform_real_distribution generator(-0.1f, 0.2f); + for (FloatT& value : literal->data()) { + value = static_cast(generator(*engine)); } } template -void PopulateWithRandomFloatingPointData(Literal* literal, - std::minstd_rand0* engine, - bool no_duplicates) { - CHECK(engine != nullptr); - PopulateWithRandomFloatingPointDataImpl(literal, engine, - no_duplicates); -} +void PopulateWithIntNext(Literal* literal); template <> -void PopulateWithRandomFloatingPointData(Literal* literal, - std::minstd_rand0* engine, - bool no_duplicates) { - // no_duplicates is ignored for half types. Unique values can only be - // generated for arrays with fewer than ~2**16 elements and no_duplicates is - // best-effort anyway. - CHECK(engine != nullptr); - std::uniform_real_distribution generator(-0.1f, 0.2f); +void PopulateWithIntNext(Literal* literal) { + // Duplicates may be generated if we don't have enough bits. + uint16 next_value = 0; for (half& value : literal->data()) { - value = static_cast(generator(*engine)); + // Zero-out the MSB of the exponent to avoid Infs and NaNs, and put it into + // the sign bit. We could be less wasteful, but this is best-effort anyway. + uint16 exponent_msb = next_value & 0x4000; + value.x = (next_value & 0xBFFF) | (exponent_msb << 1); + next_value++; } } template <> -void PopulateWithRandomFloatingPointData(Literal* literal, - std::minstd_rand0* engine, - bool no_duplicates) { - // no_duplicates is ignored for bfloat types. Unique values can only be - // generated for arrays with fewer than ~2**16 elements and no_duplicates is - // best-effort anyway. - CHECK(engine != nullptr); - std::uniform_real_distribution generator(-0.1f, 0.2f); +void PopulateWithIntNext(Literal* literal) { + // Duplicates may be generated if we don't have enough bits. + // Start at 0x80 rather than 0 to avoid denormals. + uint16 next_value = 0x80; for (bfloat16& value : literal->data()) { - value = static_cast(generator(*engine)); + // Zero-out the MSB of the exponent to avoid Infs and NaNs, and put it into + // the sign bit. We could be less wasteful, but this is best-effort anyway. 
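+ // For example, next_value = 0x4080 has the exponent MSB (bit 14) set;
+ // masking with 0xBFFF clears it and OR-ing in (0x4000 << 1) sets the sign
+ // bit instead, yielding the bit pattern 0x8080.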
+ uint16 exponent_msb = next_value & 0x4000; + value.value = (next_value & 0xBFFF) | (exponent_msb << 1); + next_value++; + } +} + +template +void PopulateWithNextAfter(Literal* literal) { + // Duplicates may be generated if the number of elements in the literal + // exceeds the number of positive values supported by the type. + float next_value = std::numeric_limits::min(); + for (float& value : literal->data()) { + value = next_value; + next_value = std::nextafter(next_value, std::numeric_limits::max()); + } +} + +template ::value || + std::is_same::value, + int>::type = 0> +void PopulateWithNoDuplicateData(Literal* literal, std::minstd_rand0* engine) { + PopulateWithIntNext(literal); + std::shuffle(literal->data().begin(), literal->data().end(), + *engine); +} + +template ::value && + !std::is_same::value, + int>::type = 0> +void PopulateWithNoDuplicateData(Literal* literal, std::minstd_rand0* engine) { + PopulateWithNextAfter(literal); + std::shuffle(literal->data().begin(), literal->data().end(), + *engine); +} + +template +void PopulateWithFloatingPointData(Literal* literal, std::minstd_rand0* engine, + bool no_duplicates) { + CHECK(engine != nullptr); + CHECK_EQ(literal->shape().element_type(), + primitive_util::NativeToPrimitiveType()); + if (no_duplicates) { + PopulateWithNoDuplicateData(literal, engine); + } else { + PopulateWithRandomFloatingPointData(literal, engine); + } +} + +template <> +void PopulateWithFloatingPointData(Literal* literal, + std::minstd_rand0* engine, + bool no_duplicates) { + CHECK(engine != nullptr); + CHECK_EQ(literal->shape().element_type(), + primitive_util::NativeToPrimitiveType()); + if (no_duplicates) { + PopulateWithNoDuplicateData(literal, engine); + } else { + PopulateWithRandomFloatingPointData(literal, engine); + } +} + +template <> +void PopulateWithFloatingPointData(Literal* literal, + std::minstd_rand0* engine, + bool no_duplicates) { + CHECK(engine != nullptr); + CHECK_EQ(literal->shape().element_type(), + primitive_util::NativeToPrimitiveType()); + if (no_duplicates) { + PopulateWithNoDuplicateData(literal, engine); + } else { + PopulateWithRandomFloatingPointData(literal, engine); } } @@ -135,20 +184,16 @@ StatusOr MakeFakeLiteralInternal(const Shape& shape, Literal literal(shape); switch (shape.element_type()) { case BF16: - PopulateWithRandomFloatingPointData(&literal, engine, - no_duplicates); + PopulateWithFloatingPointData(&literal, engine, no_duplicates); break; case F16: - PopulateWithRandomFloatingPointData(&literal, engine, - no_duplicates); + PopulateWithFloatingPointData(&literal, engine, no_duplicates); break; case F32: - PopulateWithRandomFloatingPointData(&literal, engine, - no_duplicates); + PopulateWithFloatingPointData(&literal, engine, no_duplicates); break; case F64: - PopulateWithRandomFloatingPointData(&literal, engine, - no_duplicates); + PopulateWithFloatingPointData(&literal, engine, no_duplicates); break; case S8: PopulateWithRandomIntegralData(&literal, engine, no_duplicates); diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc index e066b3f4f22..e8f5d7a9a79 100644 --- a/tensorflow/compiler/xla/tests/test_utils_test.cc +++ b/tensorflow/compiler/xla/tests/test_utils_test.cc @@ -175,5 +175,28 @@ ENTRY %sort.148.1589 (parameter.0: s32[1048576], parameter.1: s32[1048576]) -> ( } } +XLA_TEST_F(TestUtilsTest, NoDuplicatesBfloat16) { + // Inputs which are sort keys in key/value sorts should have no duplicates. 
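+ // The bf16 sort-key operand below holds 2 * 1452 = 2904 elements, so the
+ // fake-argument generator must produce that many distinct bit patterns for
+ // the uniqueness check at the end of this test to pass.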
+ auto module = ParseHloString(R"( +HloModule sort, is_scheduled=true + +ENTRY %sort. (parameter.0: bf16[2,1452], parameter.1: s32[2,1452]) -> (bf16[2,1452], s32[2,1452]) { + %parameter.0 = bf16[2,1452]{1,0} parameter(0) + %parameter.1 = s32[2,1452]{1,0} parameter(1) + ROOT %sort = (bf16[2,1452]{1,0}, s32[2,1452]{1,0}) sort(bf16[2,1452]{1,0} %parameter.0, s32[2,1452]{1,0} %parameter.1), dimensions={1} +} +)") + .ValueOrDie(); + TF_ASSERT_OK_AND_ASSIGN(std::vector args, + MakeFakeArguments(module.get())); + ASSERT_EQ(args.size(), 2); + const Literal& key_arg = args[0]; + + absl::flat_hash_set key_set; + for (const bfloat16& value : key_arg.data()) { + EXPECT_TRUE(key_set.insert(absl::bit_cast(value)).second); + } +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc index a2b7c26331b..601c6b06938 100644 --- a/tensorflow/compiler/xla/tests/token_hlo_test.cc +++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -108,26 +109,6 @@ XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) { ::testing::HasSubstr("Entry parameter 0 is or contains a token shape")); } -XLA_TEST_F(TokenHloTest, InvalidOperandToTokenInstruction) { - std::unique_ptr module = CreateNewUnverifiedModule(); - auto builder = HloComputation::Builder(TestName()); - auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0")); - builder.AddInstruction(HloInstruction::CreateAfterAll({param})); - builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(123))); - module->AddEntryComputation(builder.Build()); - - Status status = - HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false) - .Run(module.get()) - .status(); - ASSERT_IS_NOT_OK(status); - EXPECT_THAT(status.error_message(), - ::testing::HasSubstr( - "Operands of token instructions must be TOKEN types")); -} - XLA_TEST_F(TokenHloTest, TokenInWhileLoop) { // Thread a token around a while loop. Token is created and consumed by a // AfterAll instruction in the while body. @@ -220,5 +201,95 @@ ENTRY %TokenInConditional (param.3: pred[]) -> s32[] { } } +XLA_TEST_F(TokenHloTest, AddDependency) { + string module_string = R"( +HloModule AddDependency, is_scheduled=true + +// Computes (p0 + 42) * (-p1) +// where there is a dependency from the add to the negation using a token +// with after-all and add-dependency instructions. 
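+// With the arguments used below (p0 = 10, p1 = 3) this evaluates to
+// (10 + 42) * (-3) = -156, which is the expected literal checked at the end
+// of the test.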
+ENTRY %AddDependency (p0: f32[], p1: f32[]) -> f32[] { + %p0 = f32[] parameter(0) + %p1 = f32[] parameter(1) + + %forty_two = f32[] constant(42.0) + %add = f32[] add(f32[] %p0, f32[] %forty_two) + %token = token[] after-all(f32[] %add) + %p1_after_token = f32[] add-dependency(f32[] %p1, token[] %token) + %neg = f32[] negate(f32[] %p1_after_token) + ROOT %product = f32[] multiply(f32[] %add, f32[] %neg) +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseHloString(module_string, GetModuleConfigForTest())); + auto p0 = LiteralUtil::CreateR0(10.0); + auto p1 = LiteralUtil::CreateR0(3.0); + auto expected = LiteralUtil::CreateR0(-156.0); + EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0, &p1})); +} + +XLA_TEST_F(TokenHloTest, AddDependencyOfConstant) { + string module_string = R"( +HloModule AddDependencyOfConstant, is_scheduled=true + +ENTRY %AddDependency (p0: f32[]) -> f32[] { + %p0 = f32[] parameter(0) + %forty_two = f32[] constant(42.0) + %token = token[] after-all(f32[] %p0) + %forty_two_after_token = f32[] add-dependency(f32[] %forty_two, token[] %token) + ROOT %product = f32[] multiply(f32[] %p0, f32[] %forty_two_after_token) +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseHloString(module_string, GetModuleConfigForTest())); + auto p0 = LiteralUtil::CreateR0(10.0); + auto expected = LiteralUtil::CreateR0(420.0); + EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0})); +} + +XLA_TEST_F(TokenHloTest, AddDependencyAsRoot) { + string module_string = R"( +HloModule AddDependencyAsRoot, is_scheduled=true +ENTRY %AddDependency (p: f32[3]) -> f32[3] { + %p = f32[3] parameter(0) + %neg = f32[3] negate(f32[3] %p) + %token = token[] after-all() + ROOT %add_dep = f32[3] add-dependency(f32[3] %neg, token[] %token) +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseHloString(module_string, GetModuleConfigForTest())); + auto input = LiteralUtil::CreateR1({1.0, 3.0, 7.0}); + auto expected = LiteralUtil::CreateR1({-1.0, -3.0, -7.0}); + EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&input})); +} + +XLA_TEST_F(TokenHloTest, TupleShapedAddDependency) { + string module_string = R"( +HloModule TupleShapedAddDependency, is_scheduled=true +ENTRY %TupleShapedAddDependency (p0: f32[3], p1: f32[3]) -> f32[3] { + %p0 = f32[3] parameter(0) + %p1 = f32[3] parameter(1) + %forty_two = f32[] constant(42.0) + %token = token[] after-all() + %tuple = (f32[3], token[], f32[3], f32[]) tuple(f32[3] %p0, token[] %token, f32[3] %p1, f32[] %forty_two) + %add_dep = (f32[3], token[], f32[3], f32[]) add-dependency((f32[3], token[], f32[3], f32[]) %tuple, token[] %token) + %elem0 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=0 + %elem2 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=2 + ROOT %diff = f32[3] subtract(f32[3] %elem0, f32[3] %elem2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseHloString(module_string, GetModuleConfigForTest())); + auto p0 = LiteralUtil::CreateR1({3.0, 3.0, 47.0}); + auto p1 = LiteralUtil::CreateR1({1.0, -2.0, 2.0}); + auto expected = LiteralUtil::CreateR1({2.0, 5.0, 45.0}); + EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0, &p1})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index ca036f1ae0d..e57d072a063 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ 
b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -157,10 +157,12 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( stream_ptr.get(), Literal::CreateFromShape(rhs_arg_shape), rhs_arg)); + ExecutableBuildOptions build_options; + build_options.mutable_debug_options()->set_xla_hlo_profile(true); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr local_executable, client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape}, - ExecutableBuildOptions().set_hlo_profile(true))); + build_options)); Executable* executable = local_executable->executable(); HloExecutionProfile hlo_execution_profile( @@ -208,7 +210,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) { string profile_output; ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape, rhs_shape); - + VLOG(4) << "Profile Output:\n" << profile_output; std::vector profile_output_lines = absl::StrSplit(profile_output, '\n'); diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index 47be9f5adf1..ff2c3399928 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -82,13 +82,17 @@ struct Options { std::unique_ptr CompileExecutable(const HloSnapshot& module, LocalClient* client) { XlaComputation computation(module.hlo().hlo_module()); - std::vector argument_layouts; - for (const auto& param : + std::vector argument_layouts; + argument_layouts.reserve( + computation.proto().host_program_shape().parameters_size()); + std::vector argument_layout_ptrs; + for (const ShapeProto& param : computation.proto().host_program_shape().parameters()) { - argument_layouts.push_back(¶m); + argument_layouts.push_back(Shape(param)); + argument_layout_ptrs.push_back(&argument_layouts.back()); } return client - ->Compile(computation, argument_layouts, ExecutableBuildOptions()) + ->Compile(computation, argument_layout_ptrs, ExecutableBuildOptions()) .ValueOrDie(); } @@ -149,7 +153,7 @@ StatusOr ReplayComputation(const HloSnapshot& module, << "--generate_fake_infeed only works if the model has 0 or 1 " "infeed ops, but this one has >= 2."; provide_infeed = true; - infeed_shape = instruction.shape(); + infeed_shape = Shape(instruction.shape()); LOG(INFO) << "Generating fake infeed shape for inferred shape: " << ShapeUtil::HumanString(infeed_shape); } @@ -315,9 +319,10 @@ int RealMain(absl::Span args, const Options& opts) { if (snapshot.has_result()) { Literal literal = Literal::CreateFromProto(snapshot.result()).ConsumeValueOrDie(); - fprintf(stdout, "was %s:%s\n", - ShapeUtil::HumanString(snapshot.result().shape()).c_str(), - literal.ToString().c_str()); + fprintf( + stdout, "was %s:%s\n", + ShapeUtil::HumanString(Shape(snapshot.result().shape())).c_str(), + literal.ToString().c_str()); } } } diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 8ce74164741..6722641e9d2 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -152,6 +152,13 @@ static inline absl::Span AsInt64Slice( slice.size()); } +// TODO(b/29771030): This nop overload was added to simplify the migration of +// Shape from a proto to a C++ class. Remove after class has been migrated. +static inline absl::Span AsInt64Slice( + absl::Span slice) { + return slice; +} + // As above, but for uint64 types. 
static inline absl::Span AsUInt64Slice( const tensorflow::protobuf::RepeatedField& v) { @@ -387,6 +394,19 @@ T CeilOfRatio(T dividend, T divisor) { return tensorflow::MathUtil::CeilOfRatio(dividend, divisor); } +template +std::vector ElementWiseCeilOfRatio(absl::Span dividends, + absl::Span divisors) { + std::vector ceil_of_ratios; + CHECK_EQ(dividends.size(), divisors.size()); + ceil_of_ratios.reserve(dividends.size()); + absl::c_transform(dividends, divisors, std::back_inserter(ceil_of_ratios), + [](const T dividend, const T divisor) { + return CeilOfRatio(dividend, divisor); + }); + return ceil_of_ratios; +} + // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio // then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16 template diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc index 8ea8dbab257..f113a705b41 100644 --- a/tensorflow/compiler/xla/window_util.cc +++ b/tensorflow/compiler/xla/window_util.cc @@ -185,6 +185,17 @@ bool HasWindowReversal(const Window& window) { return false; } +bool AllOrNoneReversed(const Window& window) { + if (window.dimensions().size() == 0) { + return true; + } + bool reversed = window.dimensions()[0].window_reversal(); + return std::all_of(window.dimensions().begin(), window.dimensions().end(), + [&](const WindowDimension& dim) { + return dim.window_reversal() == reversed; + }); +} + bool HasDilation(const Window& window) { return HasBaseDilation(window) || HasWindowDilation(window); } diff --git a/tensorflow/compiler/xla/window_util.h b/tensorflow/compiler/xla/window_util.h index 1fb9e855fc1..099d7ecdd5c 100644 --- a/tensorflow/compiler/xla/window_util.h +++ b/tensorflow/compiler/xla/window_util.h @@ -56,6 +56,7 @@ bool HasWindowDilation(const Window& window); bool HasDilation(const Window& window); bool HasWindowReversal(const Window& window); +bool AllOrNoneReversed(const Window& window); // Returns true if the given logical dimension is inactive in the sense that it // has window bound 1, no striding and no padding. diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 28df3b03f39..bdeb1728fa2 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -193,7 +193,11 @@ message DebugOptions { // - Assuming that operations never produce or consume NaN or +/- Inf. // - Assuming that +0 and -0 are indistinguishable. bool xla_cpu_enable_fast_math = 99; - bool xla_gpu_enable_fast_math = 100; + + // When true we lower the Minimum and Maximum hlos in the GPU backend such + // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN. In other words, if this + // flag is true we don't propagate NaNs through Min and Max. + bool xla_gpu_enable_fast_min_max = 100; // Crashes the program when any kind of verification fails, instead of just // logging the failures. One example is cross checking of convolution results @@ -224,7 +228,7 @@ message ExecutionOptions { // may be faster when using this layout. // // We use a Shape here to accommodate computations that return a tuple. - Shape shape_with_output_layout = 2; + ShapeProto shape_with_output_layout = 2; // Used to seed random-number generators used in this computation. If this is // 0, we generate a seed ourselves. @@ -253,7 +257,7 @@ message TransferToClientRequest { // This optional field directs the service to return the literal in this // layout. A shape is used to hold the layout to accommodate tuples.
- Shape shape_with_layout = 2; + ShapeProto shape_with_layout = 2; } message TransferToClientResponse { @@ -281,7 +285,7 @@ message TransferToInfeedResponse { message TransferFromOutfeedRequest { // This optional field directs the service to return the literal in this // layout. A shape is used to hold the layout to accommodate tuples. - Shape shape_with_layout = 1; + ShapeProto shape_with_layout = 1; int64 replica_id = 2; DeviceHandle device_handle = 3; @@ -332,7 +336,7 @@ message CompileRequest { // The layouts of the input arguments. If not set, the default layout will be // used. Although the real arguments are not needed in compilation, the // layouts of the arguments can affect the compilation. - repeated Shape input_shape_with_layout = 3; + repeated ShapeProto input_shape_with_layout = 3; } message CompileResponse { @@ -406,7 +410,7 @@ message LoadDataRequest { string columnio_field = 2; // Individual element shape, excluding rows. - Shape element_shape = 3; + ShapeProto element_shape = 3; // Warning: ColumnIO does not support random-access, so use offset with // caution in performance-critical scenarios. @@ -422,7 +426,7 @@ message LoadDataRequest { message LoadDataResponse { GlobalDataHandle data = 1; - Shape data_shape = 2; + ShapeProto data_shape = 2; int64 available_rows = 3; int64 rows_loaded = 4; int64 nanoseconds = 5; @@ -433,7 +437,7 @@ message GetShapeRequest { } message GetShapeResponse { - Shape shape = 1; + ShapeProto shape = 1; } message UnpackRequest { diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index 683ccc40f16..85ec83437a1 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -108,6 +108,16 @@ enum Format { SPARSE = 2; } +// Describes a tile used in tiling-based layout. Refer to +// g3doc/layout_with_tiling.md for details about tiling-based layout. +message Tile { + // Number of elements in each dimension of the tile. It's ordered from the + // most major dimension of the tile to the most minor dimension of the tile. + // The dimensions correspond to a suffix of the dimensions of the shape being + // tiled. + repeated int64 dimensions = 1; +} + // A layout describes how the array is placed in (1D) memory space. This // includes the minor-to-major ordering of dimensions within a shape. // @@ -138,6 +148,20 @@ message Layout { // memory. This field must be unset unless the format is SPARSE. int64 max_sparse_elements = 5; + // A sequence of tiles, starting from the tile that's applied first to the + // Shape. + // + // TODO(b/119839262): implement tiling in each backend or add Unimplemented + // error. + repeated Tile tiles = 6; + + // Bit size of each element. If the size is bigger than what the element + // type requires, the value is stored in the least significant + // bits and the additional most significant bits are filled with 0's. + // + // TODO(b/119839262): implement in each backend or add Unimplemented error. + int64 element_size_in_bits = 7; + // Important: if any field is added, be sure to modify ShapeUtil::Equal() and // LayoutUtil::Hash appropriately to account for the new field. } @@ -154,7 +178,7 @@ message Layout { // See the XLA documentation for more information on shapes and layouts. // // LINT.IfChange -message Shape { +message ShapeProto { reserved 1; reserved "rank"; @@ -169,7 +193,7 @@ message Shape { repeated int64 dimensions = 3; // For tuples only, the shapes of constitutent shapes in the tuple sequence. 
- repeated Shape tuple_shapes = 4; + repeated ShapeProto tuple_shapes = 4; // The layout used to back this shape. Layout layout = 5; @@ -183,9 +207,9 @@ message Shape { // Shape of the parameters and output of a computation (like a traditional // function signature). -message ProgramShape { - repeated Shape parameters = 1; - Shape result = 2; +message ProgramShapeProto { + repeated ShapeProto parameters = 1; + ShapeProto result = 2; repeated string parameter_names = 3; } @@ -320,7 +344,7 @@ message DeviceAssignmentProto { // Transfers to/from the client are encoded in literal form, and the structure // of the repeated fields is implied by the shape. message LiteralProto { - Shape shape = 1; + ShapeProto shape = 1; repeated bool preds = 2; bytes s8s = 15; bytes u8s = 3; @@ -521,7 +545,7 @@ message OpSharding { } Type type = 1; // The shape of the sharded tile. - Shape tile_shape = 2; + ShapeProto tile_shape = 2; // The shape of the tile assignment tensor - this must be the same rank as // tile_shape and the product of its dimensions must equal // tile_assignment_devices.size(). diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD index 2ff97914f86..2dae746d034 100644 --- a/tensorflow/compiler/xrt/BUILD +++ b/tensorflow/compiler/xrt/BUILD @@ -22,6 +22,7 @@ xla_proto_library( deps = [ "//tensorflow/compiler/tf2xla:host_compute_metadata_proto", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/service:hlo_proto", ], ) @@ -32,20 +33,25 @@ cc_library( "xrt_compilation_cache.cc", "xrt_device.cc", "xrt_state.cc", + "xrt_util.cc", ], hdrs = [ "xrt_compilation_cache.h", "xrt_device.h", "xrt_state.h", + "xrt_util.h", ], deps = [ "//tensorflow/compiler/jit:xla_device", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:device_memory_allocator", diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc index dc62cf7a6b2..2ccdf0f02d8 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc @@ -33,6 +33,7 @@ limitations under the License. 
#include "tensorflow/compiler/xrt/xrt.pb.h" #include "tensorflow/compiler/xrt/xrt_compilation_cache.h" #include "tensorflow/compiler/xrt/xrt_device.h" +#include "tensorflow/compiler/xrt/xrt_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" @@ -108,19 +109,26 @@ Status XRTCompileOp::Compile(OpKernelContext* ctx, TF_ASSIGN_OR_RETURN(xla::XlaComputation computation, client->LoadSnapshot(computation_proto.hlo_snapshot())); - std::vector argument_layouts( + std::vector argument_layouts( + config.program_shape().parameters_size()); + std::vector argument_layout_ptrs( config.program_shape().parameters_size()); for (int i = 0; i < config.program_shape().parameters_size(); ++i) { - argument_layouts[i] = &config.program_shape().parameters(i); + argument_layouts[i] = xla::Shape(config.program_shape().parameters(i)); + argument_layout_ptrs[i] = &argument_layouts[i]; } xla::ExecutableBuildOptions build_options; build_options.set_device_ordinal(client->default_device_ordinal()); - build_options.set_result_layout(config.program_shape().result()); + build_options.set_result_layout(xla::Shape(config.program_shape().result())); build_options.set_device_allocator(device_ref.backend()->memory_allocator()); + if (config.has_debug_options()) { + *build_options.mutable_debug_options() = + BuildXlaDebugOptions(config.debug_options()); + } VLOG(1) << "Building executable"; auto compile_result = - client->Compile(computation, argument_layouts, build_options); + client->Compile(computation, argument_layout_ptrs, build_options); if (!compile_result.ok()) { return compile_result.status(); } @@ -174,11 +182,12 @@ void XRTCompileOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, handle_output); xla::LocalExecutable* executable = entry->get().get_executable(); - xla::ProgramShape program_shape = executable->executable() - ->module() - .config() - .entry_computation_layout() - .ComputeProgramShape(); + xla::ProgramShapeProto program_shape = executable->executable() + ->module() + .config() + .entry_computation_layout() + .ComputeProgramShape() + .ToProto(); Tensor program_shape_output(DT_STRING, TensorShape({1})); program_shape_output.vec()(0) = program_shape.SerializeAsString(); ctx->set_output(1, program_shape_output); diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index 8c6191ddc06..751329eefc3 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -228,14 +228,35 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( shaped_buffer, device_ref.backend(), device_ref.device_ordinal(), &output_tuple)); + if (config_proto.return_exploded_tuple() && + xla::ShapeUtil::IsTuple(output_tuple->on_device_shape())) { + int64 tuple_element_count = + xla::ShapeUtil::TupleElementCount(output_tuple->on_device_shape()); + Tensor* output_tensor; + TF_RETURN_IF_ERROR(context->allocate_output( + 0, TensorShape({tuple_element_count}), &output_tensor)); - Tensor* output_tensor; - TF_RETURN_IF_ERROR( - context->allocate_output(0, TensorShape({}), &output_tensor)); - int64 key; - TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key)); - output_tensor->scalar()() = key; + for (int64 i = 0; i < tuple_element_count; ++i) { + xla::ShapeIndex shape_index; + shape_index.push_back(i); + XRTTupleAllocation* suballocation; + 
TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( + output_tuple, shape_index, &suballocation, + /*alias_parent_allocation=*/false)); + int64 key; + TF_RETURN_IF_ERROR(suballocation->Intern(rm, &key)); + output_tensor->vec()(i) = key; + } + output_tuple->Unref(); + } else { + Tensor* output_tensor; + TF_RETURN_IF_ERROR( + context->allocate_output(0, TensorShape({}), &output_tensor)); + int64 key; + TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key)); + output_tensor->scalar()() = key; + } return Status::OK(); } diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc index ffea592491d..3258286c106 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc @@ -87,6 +87,19 @@ REGISTER_KERNEL_BUILDER(Name("XRTReadLiteral") .HostMemory("literal"), XRTReadLiteralOp); +REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral") + .Device(DEVICE_XLA_GPU) + .HostMemory("handle") + .HostMemory("literal") + .HostMemory("output_handle"), + XRTWriteLiteralOp); +REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral") + .Device(DEVICE_XLA_CPU) + .HostMemory("handle") + .HostMemory("literal") + .HostMemory("output_handle"), + XRTWriteLiteralOp); + REGISTER_KERNEL_BUILDER(Name("XRTReadLiteralAndRelease") .Device(DEVICE_XLA_GPU) .HostMemory("handle") diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h index 54b06558adc..26a58fa42d8 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h +++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h @@ -393,6 +393,56 @@ class XRTReadLiteralOp : public OpKernel { } }; +// Op that writes a new literal value into device-resident memory. +template +class XRTWriteLiteralOp : public OpKernel { + public: + explicit XRTWriteLiteralOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + ~XRTWriteLiteralOp() override = default; + XRTWriteLiteralOp(const XRTWriteLiteralOp&) = delete; + XRTWriteLiteralOp& operator=(const XRTWriteLiteralOp&) = delete; + + void Compute(OpKernelContext* ctx) override { + VLOG(1) << "XRTWriteLiteralOp::Compute"; + + const Tensor& handle_tensor = ctx->input(0); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()), + errors::Internal("computation input should be an int64 scalar")); + int64 allocation_handle = handle_tensor.scalar()(); + + const Tensor& literal_info = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(literal_info.shape()), + errors::Internal("literal input should be a string scalar")); + xla::LiteralProto literal_proto; + OP_REQUIRES(ctx, + literal_proto.ParseFromString(literal_info.scalar()()), + errors::InvalidArgument( + "Unable to parse allocation input to LiteralProto")); + xla::Literal literal; + OP_REQUIRES_OK(ctx, XRTStateHelpers::MakeLiteral(literal_proto, &literal)); + + ResourceMgr* rm; + OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); + + XRTTupleAllocation* allocation; + OP_REQUIRES_OK( + ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation)); + core::ScopedUnref allocation_unref(allocation); + // We are guaranteed that the underlying device object won't be deleted out + // from under us, while the ScopedRef is live. 
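+ // The ScopedRef pins the device owning the looked-up allocation; the parsed
+ // literal is then written in place into that allocation's device buffers and
+ // the original handle is returned unchanged.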
+ typename DeviceAccessor::ScopedRef device_ref; + OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef( + ctx, allocation->device_ordinal(), &device_ref)); + OP_REQUIRES_OK(ctx, + allocation->WriteLiteral(device_ref.backend(), literal)); + + Tensor output(DT_INT64, TensorShape({})); + output.scalar()() = allocation_handle; + ctx->set_output(0, output); + } +}; + // Op that discards a handle to device memory. template class XRTReleaseAllocationOp : public OpKernel { diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc index 07d025ce343..a3d63106fa1 100644 --- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc +++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc @@ -95,6 +95,20 @@ Copies an allocated tuple from device memory and returns it as a literal. 'literal' is a serialized xla::LiteralProto proto. )"); +REGISTER_OP("XRTWriteLiteral") + .Input("handle: int64") + .Input("literal: string") + .Output("output_handle: int64") + .SetShapeFn(tensorflow::shape_inference::ScalarShape) + .Doc( + R"( +Copies the input literal into the device memory pointed to by handle. +Returns the handle itself. + +'handle' is the id returned from the Op that produced the on-device allocation. +'literal' is a serialized xla::LiteralProto proto to be written to device memory. +)"); + REGISTER_OP("XRTReadLiteralAndRelease") .Input("handle: int64") .Output("literal: string") diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index 25464b5554d..abaa17e50e3 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -102,7 +102,7 @@ bool CompareLiteralProtos(const xla::LiteralProto& a, auto l_b = xla::Literal::CreateFromProto(b).ValueOrDie(); bool equal = l_a == l_b; if (!equal) { - LOG(INFO) << "LiteralProtos don't match " << a.DebugString() + LOG(INFO) << "LiteralProtos don't match: " << a.DebugString() << " != " << b.DebugString(); } return equal; @@ -175,6 +175,18 @@ xla::XlaComputation AddAndTuple() { return builder.Build().ValueOrDie(); } +xla::XlaComputation AddAndSubTuple() { + xla::XlaBuilder builder("AddAndSubTuple"); + auto p0 = xla::Parameter(&builder, 0, xla::ShapeUtil::MakeShape(xla::F32, {}), + "P0"); + auto p1 = xla::Parameter(&builder, 1, xla::ShapeUtil::MakeShape(xla::F32, {}), + "P1"); + auto sum = xla::Add(p0, p1); + auto sub = xla::Sub(p0, p1); + xla::Tuple(&builder, {sum, sub}); + return builder.Build().ValueOrDie(); +} + void StoreComputationSnapshot(const xla::XlaComputation& computation, xla::HloSnapshot* dst) { auto snapshot = computation.Snapshot().ValueOrDie(); @@ -203,6 +215,56 @@ xla::ProgramShape XlaCompiledProgramShape( ->ComputeProgramShape(); } +TEST(RawApiTest, AllocAndRewrite) { + xrt::XLAAllocation alloc; + alloc.set_device_ordinal(0); + *alloc.mutable_value() = + xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto(); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + auto value = + ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString()); + auto handle = ops::XRTAllocate(root, value); + auto read_back = ops::XRTReadLiteral(root, handle); + TF_ASSERT_OK(root.status()); + + tensorflow::ClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, handle}, &outputs)); + EXPECT_EQ(outputs.size(), 2); + + int64 allocation_handle = outputs[1].scalar()(); + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + 
EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response)); + outputs.clear(); + + xla::LiteralProto new_literal = + xla::LiteralUtil::CreateR2({{9, 2}, {4, 1}}).ToProto(); + auto new_value = ops::Const(root.WithDevice("/device:CPU:0"), + new_literal.SerializeAsString()); + auto write_op = + ops::XRTWriteLiteral(root, Input(allocation_handle), new_value); + TF_ASSERT_OK(root.status()); + TF_EXPECT_OK(session.Run({write_op}, &outputs)); + EXPECT_EQ(outputs.size(), 1); + EXPECT_EQ(allocation_handle, outputs[0].scalar()()); + outputs.clear(); + + auto read_after_write = ops::XRTReadLiteral(root, Input(allocation_handle)); + TF_EXPECT_OK(session.Run({read_after_write}, &outputs)); + EXPECT_EQ(outputs.size(), 1); + + xla::LiteralProto new_response; + EXPECT_TRUE(new_response.ParseFromString(outputs[0].scalar()())); + EXPECT_TRUE(CompareLiteralProtos(new_literal, new_response)); + + auto release = + ops::XRTReleaseAllocationHandle(root, Input(allocation_handle)); + TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release}, + &outputs)); +} + TEST(RawApiTest, ReadAndWriteState) { xrt::XLAAllocation alloc; alloc.set_device_ordinal(0); @@ -375,9 +437,12 @@ TEST(RawApiTest, CompileAndExecute) { xrt::XLAComputation c; auto config = c.mutable_config(); auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2}); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2}); - *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {2}); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); + *shapes->mutable_result() = + xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); StoreComputationSnapshot(AddAndScale(), c.mutable_hlo_snapshot()); xrt::XRTExecutionConfig e; @@ -411,7 +476,7 @@ TEST(RawApiTest, CompileAndExecute) { auto expected = xla::LiteralUtil::CreateR1({27.0f, 21.0f}); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); - xla::ProgramShape program_shape; + xla::ProgramShapeProto program_shape; EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); EXPECT_EQ(program_shape.parameters_size(), 2); } @@ -427,9 +492,12 @@ TEST(RawApiTest, CompileAndExecuteWithArgumentVector) { xrt::XLAComputation c; auto config = c.mutable_config(); auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2}); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2}); - *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {2}); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); + *shapes->mutable_result() = + xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); StoreComputationSnapshot(AddAndScale(), c.mutable_hlo_snapshot()); xrt::XRTExecutionConfig e; @@ -465,7 +533,7 @@ TEST(RawApiTest, CompileAndExecuteWithArgumentVector) { auto expected = xla::LiteralUtil::CreateR1({27.0f, 21.0f}); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); - xla::ProgramShape program_shape; + xla::ProgramShapeProto program_shape; EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); EXPECT_EQ(program_shape.parameters_size(), 2); } @@ -494,8 +562,8 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) { xrt::XLAComputation c; auto config = c.mutable_config(); auto shapes = 
config->mutable_program_shape(); - *shapes->add_parameters() = param_shape; - *shapes->mutable_result() = result_shape; + *shapes->add_parameters() = param_shape.ToProto(); + *shapes->mutable_result() = result_shape.ToProto(); StoreComputationSnapshot(xla_computation, c.mutable_hlo_snapshot()); Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); @@ -510,8 +578,9 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) { TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {c_handle.program_shape}, {release}, &outputs)); - xla::ProgramShape program_shape; - EXPECT_TRUE(program_shape.ParseFromString(outputs[0].vec()(0))); + xla::ProgramShapeProto program_shape_proto; + EXPECT_TRUE(program_shape_proto.ParseFromString(outputs[0].vec()(0))); + xla::ProgramShape program_shape(program_shape_proto); EXPECT_EQ(program_shape.parameters_size(), 1); VLOG(2) << "Param: " @@ -520,7 +589,7 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) { << xla::ShapeUtil::HumanStringWithLayout(program_shape.result()); xla::ProgramShape xla_program_shape = - XlaCompiledProgramShape(xla_computation, *shapes); + XlaCompiledProgramShape(xla_computation, xla::ProgramShape(*shapes)); EXPECT_TRUE(xla::LayoutUtil::Equal( xla::ShapeUtil::GetSubshape(program_shape.parameters(0), {0}).layout(), xla::ShapeUtil::GetSubshape(xla_program_shape.parameters(0), {0}) @@ -547,11 +616,11 @@ TEST(RawApiTest, DotGeneralWithLayoutTest) { auto config = c.mutable_config(); auto shapes = config->mutable_program_shape(); *shapes->add_parameters() = - xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 2}, {0, 1}); + xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 2}, {0, 1}).ToProto(); *shapes->add_parameters() = - xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1}); + xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1}).ToProto(); *shapes->mutable_result() = - xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1}); + xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1}).ToProto(); StoreComputationSnapshot(Dot(), c.mutable_hlo_snapshot()); xrt::XRTExecutionConfig e; @@ -592,7 +661,7 @@ TEST(RawApiTest, CompileAndExecuteZeroArg) { xrt::XLAComputation c; auto config = c.mutable_config(); auto shapes = config->mutable_program_shape(); - *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {}); + *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto(); xrt::XRTExecutionConfig e; e.set_release_input_handles(true); @@ -632,10 +701,13 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) { xrt::XLAComputation c; auto config = c.mutable_config(); auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2}); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2}); - *shapes->mutable_result() = xla::ShapeUtil::MakeTupleShape( - {xla::ShapeUtil::MakeShape(xla::F32, {2})}); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); + *shapes->mutable_result() = + xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})}) + .ToProto(); StoreComputationSnapshot(AddAndTuple(), c.mutable_hlo_snapshot()); xrt::XRTExecutionConfig e; @@ -671,14 +743,81 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) { EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); } +TEST(RawApiTest, CompileAndExecuteReturnExplodedTuple) { + xrt::XLAAllocation p0; + p0.set_device_ordinal(0); + 
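# --- Illustrative aside, not part of the original change -------------------
# Plain-Python view of what the CompileAndExecuteReturnExplodedTuple test that
# begins above exercises: AddAndSubTuple returns the tuple (p0 + p1, p0 - p1),
# and with the new return_exploded_tuple option each first-level tuple element
# comes back as its own allocation handle instead of one tuple-shaped
# allocation. The helper below is a stand-in, not the real execution path.
def add_and_sub_tuple(p0, p1):
    return (p0 + p1, p0 - p1)

result_tuple = add_and_sub_tuple(12.0, 3.0)
tuple_result = [result_tuple]         # default: one handle for the whole tuple
exploded_result = list(result_tuple)  # exploded: one handle per element
assert len(tuple_result) == 1 and len(exploded_result) == 2
assert exploded_result == [15.0, 9.0]   # the kResults checked by the test
# ---------------------------------------------------------------------------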
*p0.mutable_value() = xla::LiteralUtil::CreateR0(12.0f).ToProto(); + + xrt::XLAAllocation p1; + p1.set_device_ordinal(0); + *p1.mutable_value() = xla::LiteralUtil::CreateR0(3.0f).ToProto(); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto(); + *shapes->mutable_result() = + xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {}), + xla::ShapeUtil::MakeShape(xla::F32, {})}) + .ToProto(); + StoreComputationSnapshot(AddAndSubTuple(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + e.set_return_exploded_tuple(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + auto e_config = + ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); + auto computation = + ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = + ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = + ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto result = ops::XRTExecute(root, c_handle.handle, e_config, + {Output(p0_handle), Output(p1_handle)}); + TF_ASSERT_OK(root.status()); + + ClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({result}, &outputs)); + EXPECT_EQ(outputs.size(), 1); + + auto handles_vec = outputs.front().vec(); + EXPECT_EQ(handles_vec.size(), 2); + + const float kResults[2] = {15.0f, 9.0f}; + for (int64 i = 0; i < handles_vec.size(); ++i) { + auto read_back = ops::XRTReadLiteralAndRelease(root, Input(handles_vec(i))); + std::vector voutputs; + TF_EXPECT_OK(session.Run({read_back}, &voutputs)); + EXPECT_EQ(voutputs.size(), 1); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(voutputs[0].scalar()())); + + auto expected = xla::LiteralUtil::CreateR0(kResults[i]); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); + } +} + TEST(RawApiTest, LeakCompilationReference) { xrt::XLAComputation c; auto config = c.mutable_config(); auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2}); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2}); - *shapes->mutable_result() = xla::ShapeUtil::MakeTupleShape( - {xla::ShapeUtil::MakeShape(xla::F32, {2})}); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); + *shapes->mutable_result() = + xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})}) + .ToProto(); StoreComputationSnapshot(AddAndTuple(), c.mutable_hlo_snapshot()); Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); @@ -703,9 +842,9 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) { xrt::XLAComputation c; auto config = c.mutable_config(); auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {}); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {}); - *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::S64, {}); + *shapes->add_parameters() = 
xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto(); + *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto(); StoreComputationSnapshot(AddS64(), c.mutable_hlo_snapshot()); xrt::XRTExecutionConfig e; @@ -739,11 +878,11 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) { auto expected = xla::LiteralUtil::CreateR0(15123899); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); - xla::ProgramShape program_shape; + xla::ProgramShapeProto program_shape; EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); EXPECT_EQ(program_shape.parameters_size(), 2); - EXPECT_TRUE( - xla::ShapeUtil::HasPrimitiveType(program_shape.result(), xla::S64)); + EXPECT_TRUE(xla::ShapeUtil::HasPrimitiveType( + xla::Shape(program_shape.result()), xla::S64)); } } // namespace diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto index 6ab77fbaaf0..378bb9246f2 100644 --- a/tensorflow/compiler/xrt/xrt.proto +++ b/tensorflow/compiler/xrt/xrt.proto @@ -3,6 +3,7 @@ syntax = "proto3"; package xrt; import "tensorflow/compiler/tf2xla/host_compute_metadata.proto"; +import "tensorflow/compiler/xla/xla.proto"; import "tensorflow/compiler/xla/xla_data.proto"; import "tensorflow/compiler/xla/service/hlo.proto"; @@ -36,16 +37,18 @@ message XLAComputationConfig { tensorflow.tf2xla.HostComputeMetadata host_compute_metadata = 3; // The arg/result shapes for the whole computation. - xla.ProgramShape program_shape = 4; + xla.ProgramShapeProto program_shape = 4; // The arg/result shapes for each core of a model-parallel // computation. per_core_args_and_result_shapes is optional for a // single-core computation. - repeated xla.ProgramShape per_core_program_shape = 5; + repeated xla.ProgramShapeProto per_core_program_shape = 5; // Describes how replicated computation instances should be assigned to // devices. There are num_cores_per_replica computations, and each one will be // sent and executed to the set of replica device numbers described in the // DeviceAssignment proto. DeviceAssignment device_assignment = 6; + // The debugging options to be passed to the XLA compilation process. + xla.DebugOptions debug_options = 7; } // Options and XLA computation for a compilation. @@ -98,4 +101,8 @@ message XRTExecutionConfig { bool release_input_handles = 5; // If true, release the handle to the computation after running. bool release_compilation_handle = 6; + // If set to true, and the result shape is a tuple, then instead of returning + // a single tuple allocation the execution will return a vector of + // allocations, one for each of the first-level elements of the result tuple. 
+ bool return_exploded_tuple = 7; } diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc index 3a99820d7aa..5c7c537c340 100644 --- a/tensorflow/compiler/xrt/xrt_state.cc +++ b/tensorflow/compiler/xrt/xrt_state.cc @@ -183,6 +183,20 @@ Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal, return Status::OK(); } +Status XRTTupleAllocation::WriteLiteral(xla::Backend* backend, + const xla::Literal& literal) { + if (!xla::ShapeUtil::Equal(literal.shape(), on_host_shape())) { + return errors::InvalidArgument( + "New literal shape not matching the existing one: literal=", + xla::ShapeUtil::HumanStringWithLayout(literal.shape()), + " device=", xla::ShapeUtil::HumanStringWithLayout(on_host_shape())); + } + auto transfer_manager = backend->transfer_manager(); + TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal())); + return transfer_manager->TransferLiteralToDevice(stream.get(), literal, + ToShapedBuffer()); +} + void XRTTupleAllocation::DiscardAllocation( const xla::ShapeIndex& buffer_index) { buffers_.element(buffer_index)->DiscardAllocation(); diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h index 73b5584e38f..3664c0cd4e6 100644 --- a/tensorflow/compiler/xrt/xrt_state.h +++ b/tensorflow/compiler/xrt/xrt_state.h @@ -137,6 +137,9 @@ class XRTTupleAllocation : public ResourceBase { Status ToLiteral(xla::Backend* backend, int device_ordinal, xla::Literal* literal); + // Write a new literal value to the allocation. + Status WriteLiteral(xla::Backend* backend, const xla::Literal& literal); + // True if none of the buffers in the allocation are aliased by any other live // handle. bool IsExclusiveOwner(); diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc new file mode 100644 index 00000000000..3ef8bedc732 --- /dev/null +++ b/tensorflow/compiler/xrt/xrt_util.cc @@ -0,0 +1,76 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xrt/xrt_util.h" + +#include +#include + +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace { + +bool DebugOptionsPassThroughEnabled() { + const char* env = getenv("TF_XLA_DEBUG_OPTIONS_PASSTHROUGH"); + bool enabled = + env != nullptr && (strcmp(env, "1") == 0 || strcmp(env, "true") == 0); + if (enabled) { + LOG(WARNING) << "Passing through XLA debug options!"; + } else { + LOG(WARNING) << "TF_XLA_DEBUG_OPTIONS_PASSTHROUGH not set, not all options " + "will be retained"; + } + return enabled; +} + +string SafeDebugPath(const string& path) { + if (path.empty() || path.compare(0, 5, "gs://") == 0 || + path.compare(0, 11, "bigstore://") == 0) { + return path; + } + LOG(WARNING) << "Invalid config path (will be dropped): " << path; + return string(); +} + +} // namespace + +xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options) { + static const bool options_passthrough = DebugOptionsPassThroughEnabled(); + if (options_passthrough) { + return ref_options; + } + xla::DebugOptions options = xla::GetDebugOptionsFromFlags(); + options.set_xla_generate_hlo_text_to( + SafeDebugPath(ref_options.xla_generate_hlo_text_to())); + options.set_xla_dump_optimized_hlo_proto_to( + SafeDebugPath(ref_options.xla_dump_optimized_hlo_proto_to())); + options.set_xla_dump_computations_to( + SafeDebugPath(ref_options.xla_dump_computations_to())); + options.set_xla_dump_executions_to( + SafeDebugPath(ref_options.xla_dump_executions_to())); + for (auto& pass : ref_options.xla_disable_hlo_passes()) { + options.add_xla_disable_hlo_passes(pass); + } + options.set_xla_dump_unoptimized_hlo_proto_to( + SafeDebugPath(ref_options.xla_dump_unoptimized_hlo_proto_to())); + options.set_xla_dump_per_pass_hlo_proto_to( + SafeDebugPath(ref_options.xla_dump_per_pass_hlo_proto_to())); + return options; +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_util.h b/tensorflow/compiler/xrt/xrt_util.h new file mode 100644 index 00000000000..d9c05a7f340 --- /dev/null +++ b/tensorflow/compiler/xrt/xrt_util.h @@ -0,0 +1,34 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Utility functions in support of the XRT API. + +#ifndef TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_ +#define TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_ + +#include "tensorflow/compiler/xla/xla.pb.h" + +namespace tensorflow { + +// Filters the debug options provided as argument according to the value of the +// TF_XLA_DEBUG_OPTIONS_PASSTHROUGH environment variable. If such variable is +// set to "1" or "true", the debug options will be returned as is. 
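# --- Illustrative aside, not part of the original change -------------------
# The path filtering done by SafeDebugPath above, restated in plain Python:
# empty, gs:// and bigstore:// paths are kept, anything else is dropped (the
# C++ version also logs a warning for dropped paths).
def safe_debug_path(path):
    if path == "" or path.startswith("gs://") or path.startswith("bigstore://"):
        return path
    return ""

assert safe_debug_path("gs://bucket/hlo_dumps") == "gs://bucket/hlo_dumps"
assert safe_debug_path("/tmp/hlo_dumps") == ""   # local paths are stripped
# ---------------------------------------------------------------------------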
Otherwise +// only a subset of them will be set in the returned ones, and all the paths +// contained in it, will be limited to gs:// and bigstore:// ones. +xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_ diff --git a/tensorflow/contrib/all_reduce/BUILD b/tensorflow/contrib/all_reduce/BUILD index a513aa1e7c4..f6c6560c1c3 100644 --- a/tensorflow/contrib/all_reduce/BUILD +++ b/tensorflow/contrib/all_reduce/BUILD @@ -9,8 +9,6 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) -load("//tensorflow:tensorflow.bzl", "tf_py_test") - py_library( name = "all_reduce_py", srcs = ["__init__.py"], @@ -29,29 +27,6 @@ py_library( srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ - "//tensorflow/python:array_ops", - "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:nccl_ops", - ], -) - -tf_py_test( - name = "all_reduce_test", - srcs = ["python/all_reduce_test.py"], - additional_deps = [ - ":all_reduce", - "//third_party/py/numpy", - "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client", - "//tensorflow/python:framework_ops", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:math_ops", - "//tensorflow/python:constant_op", - "//tensorflow/python:client_testlib", - "//tensorflow/python:platform", - "//tensorflow/python:platform_test", - "//tensorflow/python:state_ops", + "//tensorflow/python/distribute:all_reduce", ], ) diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py index 25f4b4b8d34..238cdaf8a79 100644 --- a/tensorflow/contrib/all_reduce/python/all_reduce.py +++ b/tensorflow/contrib/all_reduce/python/all_reduce.py @@ -18,842 +18,5 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections -import math - -from tensorflow.python.framework import device as device_lib -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nccl_ops - - -def _flatten_tensors(tensors): - """Check tensors for isomorphism and flatten. - - Args: - tensors: list of T `tf.Tensor` which must all have the same shape. - - Returns: - tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors - shape: the original shape of each element of input tensors - - Raises: - ValueError: tensors are empty or non-isomorphic or have unknown shape. - """ - if not tensors: - raise ValueError("tensors cannot be empty") - shape = tensors[0].shape - for tensor in tensors: - shape = shape.merge_with(tensor.shape) - if not shape.is_fully_defined(): - raise ValueError("Tensors must have statically known shape.") - if len(shape) != 1: - reshaped = [] - for t in tensors: - with ops.colocate_with(t): - reshaped.append(array_ops.reshape(t, [-1])) - tensors = reshaped - return tensors, shape - - -def _reshape_tensors(tensors, shape): - """Reshape tensors flattened by _flatten_tensors. - - Args: - tensors: list of T `tf.Tensor` of identical length 1D tensors. - shape: list of integers describing the desired shape. Product of - the elements must equal the length of each tensor. - - Returns: - list of T `tf.Tensor` which are the reshaped inputs. 
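# --- Illustrative aside, not part of the original change -------------------
# What the _flatten_tensors / _reshape_tensors pair above does, shown with
# numpy instead of graph ops: inputs of identical, fully known shape are
# flattened to 1-D for the reduction and the original shape is remembered so
# the reduced results can be restored afterwards.
import numpy as np

def flatten_tensors(tensors):
    shape = tensors[0].shape
    assert all(t.shape == shape for t in tensors), "tensors must be isomorphic"
    return [t.reshape(-1) for t in tensors], shape

def reshape_tensors(tensors, shape):
    return [t.reshape(shape) for t in tensors]

flat, shape = flatten_tensors([np.ones((2, 3)), np.zeros((2, 3))])
assert all(t.shape == (6,) for t in flat) and shape == (2, 3)
assert all(t.shape == (2, 3) for t in reshape_tensors(flat, shape))
# ---------------------------------------------------------------------------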
- """ - reshaped = [] - for t in tensors: - with ops.colocate_with(t): - reshaped.append(array_ops.reshape(t, shape)) - return reshaped - - -def _padded_split(tensor, pieces): - """Like split for 1D tensors but pads-out case where len % pieces != 0. - - Args: - tensor: T `tf.Tensor` that must be 1D. - pieces: a positive integer specifying the number of pieces into which - tensor should be split. - - Returns: - list of T `tf.Tensor` of length pieces, which hold the values of - thin input tensor, in order. The final tensor may - be zero-padded on the end to make its size equal to those of all - of the other tensors. - - Raises: - ValueError: The input tensor is not 1D. - """ - shape = tensor.shape - if 1 != len(shape): - raise ValueError("input tensor must be 1D") - tensor_len = shape.dims[0].value - with ops.colocate_with(tensor): - if tensor_len % pieces != 0: - # pad to an even length - chunk_size = 1 + tensor_len // pieces - if pieces > tensor_len: - # This is an edge case that should not come up in practice, - # i.e. a different reduction algorithm would be better, - # but we'll make it work just for completeness. - pad_len = pieces - tensor_len - extended_whole = array_ops.concat( - [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0) - parts = array_ops.split(extended_whole, pieces) - return parts, pad_len - elif (pieces - 1) * chunk_size >= tensor_len: - # Another edge case of limited real interest. - pad_len = (pieces * chunk_size) % tensor_len - extended_whole = array_ops.concat( - [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0) - parts = array_ops.split(extended_whole, pieces) - return parts, pad_len - else: - last_chunk_size = tensor_len - (pieces - 1) * chunk_size - pad_len = chunk_size - last_chunk_size - piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size] - parts = array_ops.split(tensor, piece_lens) - parts[-1] = array_ops.concat( - [parts[-1], array_ops.zeros([pad_len], dtype=tensor.dtype)], 0) - return parts, pad_len - else: - return array_ops.split(tensor, pieces), 0 - - -def _strip_padding(tensors, pad_len): - """Strip the suffix padding added by _padded_split. - - Args: - tensors: list of T `tf.Tensor` of identical length 1D tensors. - pad_len: number of elements to be stripped from the end of each tensor. - - Returns: - list of T `tf.Tensor` which are the stripped inputs. - - Raises: - ValueError: tensors must be a non-empty list of 1D tensors, and - each must be longer than pad_len. - """ - if not tensors: - raise ValueError("tensors cannot be empty") - shape = tensors[0].shape - if len(shape) > 1: - raise ValueError("tensors must be 1D") - prefix_len = int(shape[0] - pad_len) - if prefix_len < 0: - raise ValueError("pad_len longer than tensor") - stripped = [] - for t in tensors: - with ops.colocate_with(t): - stripped.append(array_ops.slice(t, [0], [prefix_len])) - return stripped - - -def _ragged_split(tensor, pieces): - """Like split for 1D tensors but allows case where len % pieces != 0. - - Args: - tensor: T `tf.Tensor` that must be 1D. - pieces: a positive integer specifying the number of pieces into which - tensor should be split. - - Returns: - list of T `tf.Tensor` of length pieces, which hold the values of - the input tensor, in order. The final tensor may be shorter - than the others, which will all be of equal length. - - Raises: - ValueError: input tensor must be 1D. 
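# --- Illustrative aside, not part of the original change -------------------
# The two 1-D splitting helpers above, _padded_split and _ragged_split,
# restated with numpy (edge cases such as pieces > len are omitted). The
# padded variant zero-fills the tail so every chunk has equal length and
# reports how much padding was added; the ragged variant lets the last chunk
# absorb the remainder.
import numpy as np

def padded_split(vec, pieces):
    chunk = -(-len(vec) // pieces)               # ceiling division
    pad_len = chunk * pieces - len(vec)
    padded = np.concatenate([vec, np.zeros(pad_len, vec.dtype)])
    return np.split(padded, pieces), pad_len

def ragged_split(vec, pieces):
    chunk = len(vec) // pieces
    sizes = [chunk] * (pieces - 1) + [len(vec) - chunk * (pieces - 1)]
    return np.split(vec, np.cumsum(sizes)[:-1])

v = np.arange(7)
parts, pad = padded_split(v, 3)
assert pad == 2 and all(len(p) == 3 for p in parts)
assert [len(p) for p in ragged_split(v, 3)] == [2, 2, 3]
# ---------------------------------------------------------------------------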
- """ - shape = tensor.shape - if 1 != len(shape): - raise ValueError("input tensor must be 1D") - tensor_len = shape.dims[0].value - chunk_size = tensor_len // pieces - with ops.colocate_with(tensor): - if tensor_len != (pieces * chunk_size): - # last piece will be short - assert pieces > 1 - last_chunk_size = tensor_len - ((pieces - 1) * chunk_size) - assert last_chunk_size > 0 - piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size] - return array_ops.split(tensor, piece_lens) - else: - return array_ops.split(tensor, pieces) - - -def _ring_permutations(num_workers, num_subchunks, gpu_perm): - """"Generate an array of device index arrays, one for each subchunk. - - In the basic ring reduction algorithm there are size(T)/num_devices - data chunks and each device process one chunk per tick, i.e. sending - one chunk and receiving one chunk. The idea of subchunking is that - each device processes num_subchunks smaller data regions per tick, - and the ring rank permutation is different for each subchunk index - so that a device is potentially sending to and receiving from - num_subchunks different other devices at each tick. Where multiple - independent data channels exist between devices, this strategy - supplies a method of using them in parallel. - - Args: - num_workers: number of worker tasks - num_subchunks: number of subchunks into which to divide each per-GPU chunk. - gpu_perm: an array of integers in [0, num_gpus-1] giving the default - ring order of GPUs at each worker. Other permutations will be generated - by rotating this array and splicing together per-worker instances. - - Raises: - ValueError: the number of subchunks may not exceed the number of GPUs. - - Returns: - pred_by_s_d: list of lists that maps (by index) from (subchunk, dev) to - preceding device in the permutation for that subchunk. The - device index of GPU i at worker j is i + (j * num_gpus). - rank_by_s_d: list of lists that maps (by index) from (subchunk, dev) to - local rank of device d in the permutation for that subchunk. - """ - num_gpus = len(gpu_perm) - devices = num_workers * num_gpus - if devices == 0: - return [], [] - if num_subchunks > num_gpus: - raise ValueError( - "num_subchunks %d must be <= num_gpus %d" % (num_subchunks, num_gpus)) - rotation_interval = max(1, int(num_gpus / num_subchunks)) - perms_by_s = [] - for s in range(0, num_subchunks): - full_order = [] - offset = s * rotation_interval - for w in range(0, num_workers): - default_order = [(w * num_gpus) + i for i in gpu_perm] - dev_order = default_order[offset:] + default_order[:offset] - full_order += dev_order - perms_by_s.append(full_order) - pred_by_s_d = [[-1 for d in range(0, devices)] - for s in range(0, num_subchunks)] - rank_by_s_d = [[-1 for d in range(0, devices)] - for s in range(0, num_subchunks)] - for s in range(0, num_subchunks): - for d in range(0, devices): - for t in range(0, devices): - if d == perms_by_s[s][t]: - rank_by_s_d[s][d] = t - pred_by_s_d[s][d] = perms_by_s[s][(t + devices - 1) % devices] - break - return (pred_by_s_d, rank_by_s_d) - - -def build_ring_all_reduce(input_tensors, num_workers, num_subchunks, - gpu_perm, red_op, un_op=None): - """Construct a subgraph performing a ring-style all-reduce of input_tensors. - - Args: - input_tensors: a list of T `tf.Tensor` objects, which must all - have the same shape and type. - num_workers: number of worker tasks spanned by input_tensors. - num_subchunks: number of subchunks each device should process in one tick. 
- gpu_perm: a list of ints giving a ring-wise rank ordering of GPUs at - each worker. All workers must have the same number of - GPUs with the same rank ordering. If NVLINK is available, this should - be a ring order supported by NVLINK edges. - red_op: a binary operator for elementwise reduction. - un_op: an optional unary operator to apply to fully reduced values. - - Raises: - ValueError: empty input_tensors or they don't all have same - size. - - Returns: - a list of T `tf.Tensor` identical sum-reductions of input_tensors. - """ - if len(input_tensors) < 2: - raise ValueError("input_tensors must be length 2 or longer") - input_tensors, shape = _flatten_tensors(input_tensors) - devices = [t.device for t in input_tensors] - (pred_by_s_d, rank_by_s_d) = _ring_permutations( - num_workers, num_subchunks, gpu_perm) - chunks_by_dev, pad_len = _build_ring_gather( - input_tensors, devices, - num_subchunks, pred_by_s_d, rank_by_s_d, red_op) - if un_op: - chunks_by_dev = _apply_unary_to_chunks(un_op, chunks_by_dev) - output_tensors = _build_ring_scatter(pred_by_s_d, rank_by_s_d, - chunks_by_dev) - if pad_len > 0: - output_tensors = _strip_padding(output_tensors, pad_len) - if len(shape) != 1: - output_tensors = _reshape_tensors(output_tensors, shape) - return output_tensors - - -def _build_ring_gather(input_tensors, devices, num_subchunks, - pred_by_s_d, rank_by_s_d, red_op): - """Construct a subgraph for the first (reduction) pass of ring all-reduce. - - Args: - input_tensors: a list of T `tf.Tensor` 1D input tensors of same - shape and type. - devices: array of device name strings - num_subchunks: number of subchunks each device should process in one tick. - pred_by_s_d: as produced by _ring_permutations - rank_by_s_d: as produced by _ring_permutations - red_op: a binary operator for elementwise reduction - - Raises: - ValueError: tensors must all be one dimensional. - - Returns: - list of list of T `tf.Tensor` of (partially) reduced values where - exactly num_subchunks chunks at each device are fully reduced. - """ - num_devices = len(input_tensors) - if num_devices == 0: - return [] - if num_devices == 1: - return input_tensors - shape = input_tensors[0].shape - if 1 != len(shape): - raise ValueError("input tensors must be 1D") - num_chunks = num_devices * num_subchunks - num_ticks = num_devices - 1 - # Initialize chunks_by_dev with splits of the input tensors. - chunks_by_dev = [] - split_pad_len = 0 - for d in range(0, num_devices): - with ops.device(devices[d]): - splits, split_pad_len = _padded_split(input_tensors[d], num_chunks) - chunks_by_dev.append(splits) - # Reduction phase - for tick in range(0, num_ticks): - # One new partial reduction for every chunk - new_partial_reductions = [None for _ in range(0, num_chunks)] - # Compute reductions with respect to last tick's values - for d in range(0, num_devices): - with ops.device(devices[d]): - for s in range(0, num_subchunks): - rank = rank_by_s_d[s][d] - seg_index = (rank + num_devices - (2 + tick)) % num_devices - pred_dev = pred_by_s_d[s][d] - chunk_index = (seg_index * num_subchunks) + s - new_partial_reductions[chunk_index] = red_op( - chunks_by_dev[pred_dev][chunk_index], - chunks_by_dev[d][chunk_index]) - # Update chunks_by_dev with the new values at the end of the tick. 
- for d in range(0, num_devices): - for s in range(0, num_subchunks): - rank = rank_by_s_d[s][d] - seg_index = (rank + num_devices - (2 + tick)) % num_devices - chunk_index = (seg_index * num_subchunks) + s - chunks_by_dev[d][chunk_index] = new_partial_reductions[chunk_index] - return chunks_by_dev, split_pad_len - - -def _apply_unary_to_chunks(f, chunks_by_dev): - """Apply a unary op to each tensor in chunks_by_dev, on same device. - - Args: - f: a unary function over T `tf.Tensor`. - chunks_by_dev: list of lists of T `tf.Tensor`. - - Returns: - new list of lists of T `tf.Tensor` with the same structure as - chunks_by_dev containing the derived tensors. - """ - output = [] - for x in chunks_by_dev: - with ops.colocate_with(x[0]): - output.append([f(t) for t in x]) - return output - - -def _build_ring_scatter(pred_by_s_d, rank_by_s_d, - chunks_by_dev): - """Construct subgraph for second (scatter) pass of ring all-reduce. - - Args: - pred_by_s_d: as produced by _ring_permutations - rank_by_s_d: as produced by _ring_permutations - chunks_by_dev: list of list of T `tf.Tensor` indexed by ints - (device, chunk) - - Raises: - ValueError: chunks_by_dev is not well-formed - - Returns: - list of T `tf.Tensor` which are the fully reduced tensors, one - at each device corresponding to the outer dimension of chunks_by_dev. - """ - num_devices = len(chunks_by_dev) - num_chunks = len(chunks_by_dev[0]) - if 0 != num_chunks % num_devices: - raise ValueError( - "Expect number of chunks per device to be divisible by num_devices") - num_subchunks = int(num_chunks / num_devices) - num_ticks = num_devices - 1 - for tick in range(0, num_ticks): - passed_values = [None for _ in range(0, num_chunks)] - for d in range(0, num_devices): - with ops.colocate_with(chunks_by_dev[d][0]): - for s in range(0, num_subchunks): - rank = rank_by_s_d[s][d] - seg_index = (rank + num_devices - (1 + tick)) % num_devices - pred_dev = pred_by_s_d[s][d] - chunk_index = (seg_index * num_subchunks) + s - passed_values[chunk_index] = array_ops.identity( - chunks_by_dev[pred_dev][chunk_index]) - for d in range(0, num_devices): - for s in range(0, num_subchunks): - rank = rank_by_s_d[s][d] - seg_index = (rank + num_devices - (1 + tick)) % num_devices - chunk_index = (seg_index * num_subchunks) + s - chunks_by_dev[d][chunk_index] = passed_values[chunk_index] - # Join chunks at each device. - output = [] - for x in chunks_by_dev: - with ops.colocate_with(x[0]): - output.append(array_ops.concat(x, 0)) - return output - - -def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None): - """Construct a subgraph for recursive halving-doubling all-reduce. - - The recursive halving-doubling algorithm is described in - http://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf - - The concept is to arrange the participating n devices in - a linear sequence where devices exchange data pairwise - with one other device in each round. During the gather - phase there are lg(n) rounds where devices exchange - increasingly smaller sub-tensors with another device - at increasingly greater distances, until at the top - each device has 1/n of the fully reduced values. During the - scatter phase each device exchanges its fully reduced - sub-tensor (which doubles in length at each round) - with one other device at increasingly smaller distances - until each device has all of the fully reduced values. - - Note: this preliminary version requires that len(input_tensors) be a - power of 2. TODO(tucker): relax this restriction. 
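# --- Illustrative aside, not part of the original change -------------------
# The ring all-reduce assembled by build_ring_all_reduce and the gather /
# scatter helpers above, simulated with numpy for a single subchunk and no
# padding. The index scheme makes each device touch a different chunk per
# tick, so updating in place within a tick is safe here. After the gather
# phase device d owns the fully reduced chunk (d + 1) mod n; the scatter phase
# then circulates those chunks until every device holds the complete result.
import numpy as np

def ring_all_reduce(per_device):
    n = len(per_device)
    chunks = [np.array_split(t.astype(float), n) for t in per_device]
    for tick in range(n - 1):            # gather (reduce-scatter) phase
        for d in range(n):
            idx = (d - tick - 1) % n
            chunks[d][idx] = chunks[d][idx] + chunks[(d - 1) % n][idx]
    for tick in range(n - 1):            # scatter (allgather) phase
        for d in range(n):
            idx = (d - tick) % n
            chunks[d][idx] = chunks[(d - 1) % n][idx]
    return [np.concatenate(c) for c in chunks]

inputs = [np.arange(8) + 10 * d for d in range(4)]
reduced = ring_all_reduce(inputs)
assert all(np.array_equal(r, sum(inputs)) for r in reduced)
# ---------------------------------------------------------------------------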
Also, the - number of elements in each tensor must be divisible by 2^h where h - is the number of hops in each phase. This will also be relaxed in - the future with edge-case specific logic. - - Args: - input_tensors: list of T `tf.Tensor` to be elementwise reduced. - red_op: a binary elementwise reduction Op. - un_op: an optional unary elementwise Op to apply to reduced values. - - Returns: - list of T `tf.Tensor` which are the fully reduced tensors, one - at each device of input_tensors. - - Raises: - ValueError: num_devices not a power of 2, or tensor len not divisible - by 2 the proper number of times. - """ - devices = [t.device for t in input_tensors] - input_tensors, shape = _flatten_tensors(input_tensors) - reduced_shards = _build_recursive_hd_gather(input_tensors, devices, red_op) - if un_op: - reduced_shards = [un_op(t) for t in reduced_shards] - output_tensors = _build_recursive_hd_scatter(reduced_shards, devices) - if len(shape) != 1: - output_tensors = _reshape_tensors(output_tensors, shape) - return output_tensors - - -def _build_recursive_hd_gather(input_tensors, devices, red_op): - """Construct the gather phase of recursive halving-doubling all-reduce. - - Args: - input_tensors: list of T `tf.Tensor` to be elementwise reduced. - devices: a list of strings naming the devices hosting input_tensors, - which will also be used to host the (partial) reduction values. - red_op: a binary elementwise reduction Op. - - Returns: - list of T `tf.Tensor` which are the fully reduced tensor shards. - - Raises: - ValueError: num_devices not a power of 2, or tensor len not divisible - by 2 the proper number of times. - """ - num_devices = len(devices) - num_hops = int(math.log(num_devices, 2)) - if num_devices != (2 ** num_hops): - raise ValueError("num_devices must be a power of 2") - chunks = input_tensors - for h in range(0, num_hops): - span = 2 ** h - group_size = span * 2 - new_chunks = [[] for _ in devices] - for d in range(0, num_devices): - if (d % group_size) >= (group_size / 2): - # skip right half of a pair - continue - left_dev = devices[d] - right_dev = devices[d + span] - left_split = array_ops.split(chunks[d], 2) - right_split = array_ops.split(chunks[d+span], 2) - with ops.device(left_dev): - new_chunks[d] = red_op(left_split[0], right_split[0]) - with ops.device(right_dev): - new_chunks[d + span] = red_op(left_split[1], right_split[1]) - chunks = new_chunks - return chunks - - -def _build_recursive_hd_scatter(input_tensors, devices): - """Construct the scatter phase of recursive halving-doublng all-reduce. - - Args: - input_tensors: list of T `tf.Tensor` that are fully-reduced shards. - devices: a list of strings naming the devices on which the reconstituted - full tensors should be placed. - - Returns: - list of T `tf.Tensor` which are the fully reduced tensors. 
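# --- Illustrative aside, not part of the original change -------------------
# The recursive halving-doubling scheme described in the docstrings above,
# simulated with numpy for a power-of-two number of in-process "devices".
# In each halving hop the two members of a pair exchange halves of their
# current chunk and keep one reduced half each, so after log2(n) hops every
# device owns a fully reduced 1/n shard; the doubling hops then concatenate
# shards pairwise, in reverse order, back into full tensors.
import numpy as np

def recursive_hd_all_reduce(per_device):
    n = len(per_device)
    assert n > 0 and n & (n - 1) == 0, "number of devices must be a power of 2"
    chunks = [t.astype(float) for t in per_device]
    span = 1
    while span < n:                       # halving (reduce-scatter) phase
        new_chunks = list(chunks)
        for d in range(n):
            if d % (2 * span) < span:     # d is the left member of its pair
                left = np.split(chunks[d], 2)
                right = np.split(chunks[d + span], 2)
                new_chunks[d] = left[0] + right[0]          # keep first half
                new_chunks[d + span] = left[1] + right[1]   # keep second half
        chunks = new_chunks
        span *= 2
    span = n // 2
    while span >= 1:                      # doubling (allgather) phase
        new_chunks = list(chunks)
        for d in range(n):
            if d % (2 * span) < span:
                joined = np.concatenate([chunks[d], chunks[d + span]])
                new_chunks[d] = joined
                new_chunks[d + span] = joined
        chunks = new_chunks
        span //= 2
    return chunks

inputs = [np.arange(8) + 10 * d for d in range(4)]
out = recursive_hd_all_reduce(inputs)
assert all(np.array_equal(t, sum(inputs)) for t in out)
# ---------------------------------------------------------------------------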
- """ - num_devices = len(devices) - num_hops = int(math.log(num_devices, 2)) - assert num_devices == (2 ** num_hops), "num_devices must be a power of 2" - chunks = input_tensors - for h in reversed(range(0, num_hops)): - span = 2 ** h - group_size = span * 2 - new_chunks = [[] for _ in devices] - for d in range(0, num_devices): - if (d % group_size) >= (group_size / 2): - # skip right half of a pair - continue - left_idx = d - right_idx = d + span - left_dev = devices[left_idx] - right_dev = devices[right_idx] - with ops.device(left_dev): - new_chunks[left_idx] = array_ops.concat([chunks[left_idx], - chunks[right_idx]], 0) - with ops.device(right_dev): - new_chunks[right_idx] = array_ops.concat([chunks[left_idx], - chunks[right_idx]], 0) - chunks = new_chunks - return chunks - - -def build_shuffle_all_reduce(input_tensors, gather_devices, red_op, un_op=None): - """Construct a subgraph for shuffle all-reduce. - - Shuffle reduce is essentially the algorithm implemented when using - parameter servers. Suppose tensor length is n, there are d devices - and g gather shards. Each device sends a n/g length sub-tensor to - each gather shard. The gather shards perform a reduction across d - fragments, then broadcast the result back to each device. The - devices then join the g fully reduced fragments they receive from - the shards. The gather shards could perform d-1 pairwise - reductions, or one d-way reduction. The first is better where - reduction Op time is low compared to transmission time, the second - better in the other case. - - Args: - input_tensors: list of T @(tf.Tensor} values to be reduced. - gather_devices: list of names of devices on which reduction shards - should be placed. - red_op: an n-array elementwise reduction Op - un_op: optional elementwise unary Op to be applied to fully-reduced values. - - Returns: - list of T `tf.Tensor` which are the fully reduced tensors. - """ - input_tensors, shape = _flatten_tensors(input_tensors) - dst_devices = [t.device for t in input_tensors] - reduced_shards = _build_shuffle_gather(input_tensors, gather_devices, - red_op, un_op) - output_tensors = _build_shuffle_scatter(reduced_shards, dst_devices) - if len(shape) != 1: - output_tensors = _reshape_tensors(output_tensors, shape) - return output_tensors - - -def _build_shuffle_gather(input_tensors, gather_devices, red_op, un_op=None): - """Construct the gather (concentrate and reduce) phase of shuffle all-reduce. - - Args: - input_tensors: list of T @(tf.Tensor} values to be reduced. - gather_devices: list of names of devices on which reduction shards - should be placed. - red_op: the binary reduction Op - un_op: optional elementwise unary Op to be applied to fully-reduced values. - - Returns: - list of T `tf.Tensor` which are the fully reduced shards. - - Raises: - ValueError: inputs not well-formed. 
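# --- Illustrative aside, not part of the original change -------------------
# The shuffle (parameter-server style) all-reduce described above, simulated
# with numpy: every device contributes one shard to each gather shard, each
# gather shard reduces its piece across all contributors, and the reduced
# shards are concatenated back on every destination device. Shard sizing here
# uses numpy's array_split rather than the exact _ragged_split layout.
import numpy as np

def shuffle_all_reduce(per_device, num_gather_shards):
    shards_by_source = [np.array_split(t.astype(float), num_gather_shards)
                        for t in per_device]
    reduced_shards = [sum(shards[g] for shards in shards_by_source)
                      for g in range(num_gather_shards)]
    full = np.concatenate(reduced_shards)
    return [full.copy() for _ in per_device]   # broadcast back to each device

inputs = [np.arange(10) + 10 * d for d in range(3)]
out = shuffle_all_reduce(inputs, num_gather_shards=2)
assert all(np.array_equal(t, sum(inputs)) for t in out)
# ---------------------------------------------------------------------------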
- """ - num_source_devices = len(input_tensors) - num_gather_devices = len(gather_devices) - shape = input_tensors[0].shape - if len(shape) != 1: - raise ValueError("input_tensors must be 1D") - shards_by_source = [] - for d in range(0, num_source_devices): - with ops.colocate_with(input_tensors[d]): - shards_by_source.append( - _ragged_split(input_tensors[d], num_gather_devices)) - reduced_shards = [] - for d in range(0, num_gather_devices): - with ops.device(gather_devices[d]): - values = [s[d] for s in shards_by_source] - red_shard = red_op(values) - if un_op: - red_shard = un_op(red_shard) - reduced_shards.append(red_shard) - return reduced_shards - - -def _build_shuffle_scatter(reduced_shards, dst_devices): - """Build the scatter phase of shuffle all-reduce. - - Args: - reduced_shards: list of T @(tf.Tensor} fully reduced shards - dst_devices: list of names of devices at which the fully-reduced value - should be reconstituted. - - Returns: - list of T `tf.Tensor` scattered tensors. - """ - num_devices = len(dst_devices) - out_tensors = [] - for d in range(0, num_devices): - with ops.device(dst_devices[d]): - out_tensors.append(array_ops.concat(reduced_shards, 0)) - return out_tensors - - -def _split_by_task(devices, values): - """Partition devices and values by common task. - - Args: - devices: list of device name strings - values: list of T `tf.tensor` of same length as devices. - - Returns: - (per_task_devices, per_task_values) where both values are - lists of lists with isomorphic structure: the outer list is - indexed by task, and the inner list has length of the number - of values belonging to that task. per_task_devices contains - the specific devices to which the values are local, and - per_task_values contains the corresponding values. - - Raises: - ValueError: devices must be same length as values. - """ - num_devices = len(devices) - if num_devices != len(values): - raise ValueError("len(devices) must equal len(values)") - per_task_devices = collections.OrderedDict() - per_task_values = collections.OrderedDict() - for d in range(num_devices): - d_spec = device_lib.DeviceSpec.from_string(devices[d]) - if not hasattr(d_spec, "task") or d_spec.task is None: - assert False, "failed to parse device %s" % devices[d] - index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task) - if index not in per_task_devices: - per_task_devices[index] = [] - per_task_values[index] = [] - per_task_devices[index].append(devices[d]) - per_task_values[index].append(values[d]) - - return (list(per_task_devices.values()), list(per_task_values.values())) - - -def build_nccl_all_reduce(input_tensors, red_op, un_op=None): - """Build a subgraph that does one full all-reduce, using NCCL. - - Args: - input_tensors: list of T `tf.Tensor` of same-shape and type values to - be reduced. - red_op: binary elementwise reduction operator. Must be one of - {tf.add} - un_op: optional unary elementwise Op to apply to fully-reduce values. - - Returns: - list of T `tf.Tensor` of reduced values. - - Raises: - ValueError: red_op not supported. - """ - if red_op == math_ops.add: - output_tensors = nccl_ops.all_sum(input_tensors) - else: - raise ValueError("red_op not supported by NCCL all-reduce: ", red_op) - if un_op: - un_op_wrapped = [] - for t in output_tensors: - with ops.colocate_with(t): - un_op_wrapped.append(un_op(t)) - output_tensors = un_op_wrapped - return output_tensors - - -def _build_nccl_hybrid(input_tensors, red_op, upper_level_f): - """Construct a subgraph for NCCL hybrid all-reduce. 
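# --- Illustrative aside, not part of the original change -------------------
# The two-level "hybrid" pattern that the NCCL and shuffle hybrid builders in
# this file follow, simulated with numpy: values are first reduced inside each
# worker, one representative value per worker is then reduced across workers,
# and the total is broadcast back to every device of every worker.
import numpy as np

def hybrid_all_reduce(per_worker_values):
    # Stage 1: reduce within each worker (stands in for the NCCL/shuffle stage).
    per_worker_sum = [sum(v.astype(float) for v in values)
                      for values in per_worker_values]
    # Stage 2: reduce the one-value-per-worker list across workers.
    total = sum(per_worker_sum)
    # Stage 3: broadcast the result back to every device of every worker.
    return [[total.copy() for _ in values] for values in per_worker_values]

workers = [[np.arange(4) + 10 * w + d for d in range(2)] for w in range(3)]
out = hybrid_all_reduce(workers)
expected = sum(sum(values) for values in workers)
assert all(np.array_equal(t, expected) for values in out for t in values)
# ---------------------------------------------------------------------------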
- - Args: - input_tensors: list of T `tf.Tensor` of same-shape and type values to - be reduced. - red_op: binary elementwise reduction operator. - upper_level_f: function for reducing one value per worker, across - workers. - - Returns: - list of T `tf.Tensor` of reduced values. - - Raises: - ValueError: inputs not well-formed. - """ - input_tensors, shape = _flatten_tensors(input_tensors) - devices = [t.device for t in input_tensors] - per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors) - num_workers = len(per_worker_devices) - up_values = [None for w in range(0, num_workers)] - up_devices = up_values[:] - down_values = up_values[:] - # First stage: reduce within each worker using NCCL - for w in range(0, num_workers): - worker_values = build_nccl_all_reduce(per_worker_values[w], red_op) - # NOTE: these reductions will not run to completion unless - # every output value is used. Since we only need one, we - # need to put control dependencies on the rest. - with ops.control_dependencies(worker_values): - with ops.device(worker_values[0].device): - up_values[w] = array_ops.identity(worker_values[0]) - up_devices[w] = per_worker_devices[w][0] - # Second stage: Apply upper_level_f to reduce across first device at - # each worker - level_2_output = upper_level_f(up_values) - # Third stage: propagate within each worker using NCCL Broadcast - for w in range(0, num_workers): - dst_tensors = [] - with ops.device(per_worker_devices[w][0]): - broadcast_src = nccl_ops.broadcast(array_ops.identity(level_2_output[w])) - for d in per_worker_devices[w]: - with ops.device(d): - dst_tensors.append(array_ops.identity(broadcast_src)) - down_values[w] = dst_tensors - output_tensors = [v for sublist in down_values for v in sublist] - if len(shape) != 1: - output_tensors = _reshape_tensors(output_tensors, shape) - return output_tensors - - -def _reduce_non_singleton(input_tensors, red_f, un_op): - """If input_tensors has more than one element apply red_f, else apply un_op.""" - if len(input_tensors) > 1: - return red_f(input_tensors) - else: - if not un_op: - return input_tensors - output_tensors = [] - for t in input_tensors: - with ops.colocate_with(t): - output_tensors.append(un_op(t)) - return output_tensors - - -def build_nccl_then_ring(input_tensors, subdiv, red_op, un_op=None): - """Construct hybrid of NCCL within workers, Ring across workers.""" - def upper_builder(y): - return build_ring_all_reduce(y, len(y), subdiv, [0], red_op, un_op) - def upper_level_f(x): - return _reduce_non_singleton(x, upper_builder, un_op) - return _build_nccl_hybrid(input_tensors, red_op, upper_level_f) - - -def build_nccl_then_recursive_hd(input_tensors, red_op, un_op=None): - """Construct hybrid of NCCL within workers, Recursive-HD across workers.""" - upper_level_f = lambda x: build_recursive_hd_all_reduce(x, red_op, un_op) - return _build_nccl_hybrid(input_tensors, red_op, upper_level_f) - - -def build_nccl_then_shuffle(input_tensors, gather_devices, nccl_red_op, - shuffle_red_op, un_op=None): - """Construct hybrid of NCCL within workers, Shuffle across workers.""" - upper_level_f = lambda x: build_shuffle_all_reduce(x, gather_devices, - shuffle_red_op, un_op) - return _build_nccl_hybrid(input_tensors, nccl_red_op, upper_level_f) - - -def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f): - """Construct a subgraph for Shuffle hybrid all-reduce. - - Args: - input_tensors: list of T `tf.Tensor` of same-shape and type values to - be reduced. 
- gather_devices: list of device names on which to host gather shards. - red_op: binary elementwise reduction operator. - upper_level_f: function for reducing one value per worker, across - workers. - - Returns: - list of T `tf.Tensor` of reduced values. - - Raises: - ValueError: inputs not well-formed. - """ - input_tensors, shape = _flatten_tensors(input_tensors) - # First stage, reduce across each worker using gather_devices. - devices = [t.device for t in input_tensors] - per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors) - num_workers = len(per_worker_devices) - up_values = [] - if len(gather_devices) != num_workers: - raise ValueError("For shuffle hybrid, gather_devices must contain one " - "device per worker. ") - for w in range(0, num_workers): - reduced_shards = _build_shuffle_gather( - per_worker_values[w], [gather_devices[w]], red_op) - up_values.append(reduced_shards[0]) - # Second stage, apply upper_level_f. - level_2_output = upper_level_f(up_values) - # Third stage, apply shuffle scatter at each worker. - output_tensors = [] - for w in range(0, num_workers): - output_tensors += _build_shuffle_scatter( - [level_2_output[w]], per_worker_devices[w]) - if len(shape) != 1: - output_tensors = _reshape_tensors(output_tensors, shape) - return output_tensors - - -def build_shuffle_then_ring(input_tensors, gather_devices, subdiv, - red_n_op, red_op, un_op=None): - """Construct hybrid of Shuffle within workers, Ring across workers.""" - def upper_builder(tensors): - return build_ring_all_reduce(tensors, len(tensors), subdiv, [0], - red_op, un_op) - def upper_level_f(tensors): - return _reduce_non_singleton(tensors, upper_builder, un_op) - return _build_shuffle_hybrid( - input_tensors, gather_devices, red_n_op, upper_level_f) - - -def build_shuffle_then_shuffle(input_tensors, first_gather_devices, - second_gather_devices, red_op, un_op=None): - """Construct hybrid of Shuffle within workers, Shuffle across workers.""" - def upper_builder(tensors): - return build_shuffle_all_reduce(tensors, second_gather_devices, - red_op, un_op) - def upper_level_f(tensors): - return _reduce_non_singleton(tensors, upper_builder, un_op) - return _build_shuffle_hybrid( - input_tensors, first_gather_devices, red_op, upper_level_f) +# pylint: disable=unused-import,wildcard-import +from tensorflow.python.distribute.all_reduce import * diff --git a/tensorflow/contrib/autograph/examples/benchmarks/BUILD b/tensorflow/contrib/autograph/examples/benchmarks/BUILD new file mode 100644 index 00000000000..6d2d70c99b4 --- /dev/null +++ b/tensorflow/contrib/autograph/examples/benchmarks/BUILD @@ -0,0 +1,36 @@ +licenses(["notice"]) # Apache 2.0 + +load("//tensorflow:tensorflow.bzl", "py_test") +load("//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark") + +py_library( + name = "benchmark_base", + srcs = [ + "benchmark_base.py", + ], + deps = [ + "//tensorflow:tensorflow_py", + ], +) + +py_test( + name = "cartpole_benchmark", + size = "enormous", + srcs = ["cartpole_benchmark.py"], + tags = [ + "local", + "manual", + "no_oss", + "notap", + "nozapfhahn", + ], + deps = [ + ":benchmark_base", + # Note: required gym dependency may need to be added here. 
+ ], +) + +tf_py_logged_benchmark( + name = "cartpole_logged_benchmark", + target = "//tensorflow/contrib/autograph/examples/benchmarks:cartpole_benchmark", +) diff --git a/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py b/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py new file mode 100644 index 00000000000..93c694849c4 --- /dev/null +++ b/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py @@ -0,0 +1,62 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Common benchmarking code. + +See https://www.tensorflow.org/community/benchmarks for usage. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import numpy as np + +import tensorflow as tf + + +class ReportingBenchmark(tf.test.Benchmark): + """Base class for a benchmark that reports general performance metrics. + + Subclasses only need to call one of the _profile methods, and optionally + report_results. + """ + + def time_execution(self, name, target, iters, warm_up_iters=5): + for _ in range(warm_up_iters): + target() + + all_times = [] + for _ in range(iters): + iter_time = time.time() + target() + all_times.append(time.time() - iter_time) + + avg_time = np.average(all_times) + + extras = dict() + extras['all_times'] = all_times + + if isinstance(name, tuple): + extras['name'] = name + name = '_'.join(str(piece) for piece in name) + + self.report_benchmark( + iters=iters, wall_time=avg_time, name=name, extras=extras) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py b/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py new file mode 100644 index 00000000000..4f553be58e9 --- /dev/null +++ b/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py @@ -0,0 +1,492 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A basic RL cartpole benchmark. + +The RL model uses the OpenAI Gym environment to train a simple network using +the policy gradients method. 
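# --- Illustrative aside, not part of the original change -------------------
# The measurement pattern used by ReportingBenchmark.time_execution above,
# reduced to plain Python: run a few warm-up calls that are not measured, then
# time each of `iters` calls and report the average wall time together with
# the raw per-iteration samples.
import time

def time_execution(target, iters, warm_up_iters=5):
    for _ in range(warm_up_iters):
        target()                           # warm-up, excluded from the timing
    all_times = []
    for _ in range(iters):
        start = time.time()
        target()
        all_times.append(time.time() - start)
    return sum(all_times) / len(all_times), all_times

avg, samples = time_execution(lambda: sum(range(10000)), iters=20)
assert len(samples) == 20 and avg >= 0.0
# ---------------------------------------------------------------------------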
The training scales the gradients for each step +by the episode's cumulative discounted reward and averages these gradients over +a fixed number of games before applying the optimization step. + +For benchmarking purposes, we replace the OpenAI Gym environment to a fake +that returns random actions and rewards and never ends the episode. This way +the benchmarks compare the same amount of computation at each step. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gym +import numpy as np +import tensorflow as tf + +from tensorflow.contrib import eager +from tensorflow.contrib.autograph.examples.benchmarks import benchmark_base +from tensorflow.python import autograph as ag +from tensorflow.python.eager import context + +# +# AutoGraph implementation +# + + +@ag.convert() +def graph_append_discounted_rewards(destination, rewards, discount_rate): + """Discounts episode rewards and appends them to destination.""" + ag.set_element_type(rewards, tf.float32) + + cdr = 0.0 + reverse_discounted = [] + ag.set_element_type(reverse_discounted, tf.float32) + + for i in range(len(rewards) - 1, -1, -1): + cdr = cdr * discount_rate + rewards[i] + cdr.set_shape(()) + reverse_discounted.append(cdr) + + retval = destination + # Note: AutoGraph doesn't yet support reversed() so we use a loop instead. + for i in range(len(reverse_discounted) - 1, -1, -1): + retval.append(reverse_discounted[i]) + + return retval + + +class GraphPolicyNetwork(tf.keras.Model): + """Policy network for the cart-pole reinforcement learning problem. + + The forward path of the network takes an observation from the cart-pole + environment (length-4 vector) and outputs an action. + """ + + def __init__(self, hidden_size): + super(GraphPolicyNetwork, self).__init__() + self._hidden_layer = tf.keras.layers.Dense( + hidden_size, activation=tf.nn.elu) + self._output_layer = tf.keras.layers.Dense(1) + + def call(self, inputs): + """Calculates logits and action. + + Args: + inputs: Observations from a step in the cart-pole environment, of shape + `(batch_size, input_size)` + + Returns: + logits: the logits output by the output layer. This can be viewed as the + likelihood vales of choosing the left (0) action. Shape: + `(batch_size, 1)`. + actions: randomly selected actions ({0, 1}) based on the logits. Shape: + `(batch_size, 1)`. + """ + hidden = self._hidden_layer(inputs) + logits = self._output_layer(hidden) + + left_prob = tf.nn.sigmoid(logits) + action_probs = tf.concat([left_prob, 1.0 - left_prob], 1) + + actions = tf.multinomial(tf.log(action_probs), 1) + return logits, actions + + # TODO(mdan): Move this method out of the class. + @ag.convert() + def train(self, cart_pole_env, optimizer, discount_rate, num_games, + max_steps_per_game): + var_list = tf.trainable_variables() + grad_list = [ + tf.TensorArray(tf.float32, 0, dynamic_size=True) for _ in var_list + ] + + step_counts = [] + discounted_rewards = [] + ag.set_element_type(discounted_rewards, tf.float32) + ag.set_element_type(step_counts, tf.int32) + + # Note: we use a shared object, cart_pole_env here. Because calls to the + # object's method are made through py_func, TensorFlow cannot detect its + # data dependencies. Hence we must manually synchronize access to it + # and ensure the control dependencies are set in such a way that + # calls to reset(), take_one_step, etc. are made in the correct order. 
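# --- Illustrative aside, not part of the original change -------------------
# The reward discounting performed by graph_append_discounted_rewards (and its
# eager twin below), checked in plain Python: rewards are traversed backwards
# so position i accumulates rewards[i] + discount_rate * (discounted future).
def append_discounted_rewards(destination, rewards, discount_rate):
    cdr = 0.0
    reverse_discounted = []
    for r in reversed(rewards):
        cdr = cdr * discount_rate + r
        reverse_discounted.append(cdr)
    destination.extend(reversed(reverse_discounted))
    return destination

out = append_discounted_rewards([], [1.0, 1.0, 1.0], discount_rate=0.5)
assert out == [1.75, 1.5, 1.0]   # 1 + 0.5*(1 + 0.5*1), 1 + 0.5*1, 1
# ---------------------------------------------------------------------------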
+ sync_counter = tf.constant(0) + + for _ in tf.range(num_games): + with tf.control_dependencies([sync_counter]): + obs = cart_pole_env.reset() + with tf.control_dependencies([obs]): + sync_counter += 1 + + game_rewards = [] + ag.set_element_type(game_rewards, tf.float32) + + for step in tf.range(max_steps_per_game): + logits, actions = self(obs) # pylint:disable=not-callable + logits = tf.reshape(logits, ()) + actions = tf.reshape(actions, ()) + + labels = 1.0 - tf.cast(actions, tf.float32) + loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=labels, logits=logits) + grads = tf.gradients(loss, var_list) + + for i in range(len(grads)): + grad_list[i].append(grads[i]) + + with tf.control_dependencies([sync_counter]): + obs, reward, done = cart_pole_env.step(actions) + with tf.control_dependencies([obs]): + sync_counter += 1 + obs = tf.reshape(obs, (1, 4)) + + game_rewards.append(reward) + if reward < 0.1 or done: + step_counts.append(step + 1) + break + + discounted_rewards = graph_append_discounted_rewards( + discounted_rewards, game_rewards, discount_rate) + + discounted_rewards = ag.stack(discounted_rewards) + discounted_rewards.set_shape((None,)) + mean, variance = tf.nn.moments(discounted_rewards, [0]) + normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance) + + for i in range(len(grad_list)): + g = ag.stack(grad_list[i]) + + # This block just adjusts the shapes to match for multiplication. + r = normalized_rewards + if r.shape.ndims < g.shape.ndims: + r = tf.expand_dims(r, -1) + if r.shape.ndims < g.shape.ndims: + r = tf.expand_dims(r, -1) + + grad_list[i] = tf.reduce_mean(g * r, axis=0) + + optimizer.apply_gradients( + zip(grad_list, var_list), global_step=tf.train.get_global_step()) + + return ag.stack(step_counts) + + +@ag.convert() +def graph_train_model(policy_network, cart_pole_env, optimizer, iterations): + """Trains the policy network for a given number of iterations.""" + i = tf.constant(0) + mean_steps_per_iteration = [] + ag.set_element_type(mean_steps_per_iteration, tf.int32) + + while i < iterations: + steps_per_game = policy_network.train( + cart_pole_env, + optimizer, + discount_rate=0.95, + num_games=20, + max_steps_per_game=200) + mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game)) + i += 1 + + return ag.stack(mean_steps_per_iteration) + + +class GraphGymCartpoleEnv(object): + """An env backed by OpenAI Gym's CartPole environment. + + Used to confirm a functional model only. + """ + + def __init__(self): + cart_pole_env = gym.make('CartPole-v1') + cart_pole_env.seed(0) + cart_pole_env.reset() + self.env = cart_pole_env + + def reset(self): + obs = ag.utils.wrap_py_func(self.env.reset, tf.float64, ()) + obs = tf.reshape(obs, (1, 4)) + obs = tf.cast(obs, tf.float32) + return obs + + def step(self, actions): + + def take_one_step(actions): + obs, reward, done, _ = self.env.step(actions) + obs = obs.astype(np.float32) + reward = np.float32(reward) + return obs, reward, done + + return ag.utils.wrap_py_func(take_one_step, + (tf.float32, tf.float32, tf.bool), (actions,)) + + +class GraphRandomCartpoleEnv(object): + """An environment that returns random actions and never finishes. + + Used during benchmarking, it will cause training to run a constant number of + steps. 
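+
+ Observations keep CartPole's (1, 4) shape, so the policy network receives
+ inputs of the same size as with the real environment; step() simply draws a
+ fresh random observation under a control dependency on the incoming actions.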
+ """ + + def reset(self): + return tf.random.normal((1, 4)) + + def step(self, actions): + with tf.control_dependencies([actions]): + random_obs = tf.random.normal((1, 4)) + fixed_reward = tf.constant(0.001) + done = tf.constant(False) + return random_obs, fixed_reward, done + + +# +# Eager implementation +# + + +def eager_append_discounted_rewards(discounted_rewards, rewards, discount_rate): + cdr = 0.0 + reverse_discounted = [] + + for i in range(len(rewards) - 1, -1, -1): + cdr = cdr * discount_rate + rewards[i] + reverse_discounted.append(cdr) + + discounted_rewards.extend(reversed(reverse_discounted)) + return discounted_rewards + + +class EagerPolicyNetwork(tf.keras.Model): + """Policy network for the cart-pole reinforcement learning problem. + + The forward path of the network takes an observation from the cart-pole + environment (length-4 vector) and outputs an action. + """ + + def __init__(self, hidden_size): + super(EagerPolicyNetwork, self).__init__() + self._hidden_layer = tf.keras.layers.Dense( + hidden_size, activation=tf.nn.elu) + self._output_layer = tf.keras.layers.Dense(1) + + def call(self, inputs): + """Calculates logits and action. + + Args: + inputs: Observations from a step in the cart-pole environment, of shape + `(batch_size, input_size)` + + Returns: + logits: the logits output by the output layer. This can be viewed as the + likelihood vales of choosing the left (0) action. Shape: + `(batch_size, 1)`. + actions: randomly selected actions ({0, 1}) based on the logits. Shape: + `(batch_size, 1)`. + """ + hidden = self._hidden_layer(inputs) + logits = self._output_layer(hidden) + + left_prob = tf.nn.sigmoid(logits) + action_probs = tf.concat([left_prob, 1.0 - left_prob], 1) + + self._grad_fn = eager.implicit_gradients( + self._get_cross_entropy_and_save_actions) + + actions = tf.multinomial(tf.log(action_probs), 1) + return logits, actions + + def _get_cross_entropy_and_save_actions(self, inputs): + logits, actions = self(inputs) # pylint:disable=not-callable + self._current_actions = actions + labels = 1.0 - tf.cast(actions, tf.float32) + return tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits) + + def train(self, cart_pole_env, optimizer, discount_rate, num_games, + max_steps_per_game): + grad_list = None + + step_counts = [] + discounted_rewards = [] + + for _ in range(num_games): + obs = cart_pole_env.reset() + + game_rewards = [] + + for step in range(max_steps_per_game): + grads_and_vars = self._grad_fn(tf.constant([obs], dtype=tf.float32)) + grads, var_list = zip(*grads_and_vars) + actions = self._current_actions.numpy()[0][0] + + if grad_list is None: + grad_list = [[g] for g in grads] + else: + for i in range(len(grads)): + grad_list[i].append(grads[i]) + + obs, reward, done = cart_pole_env.step(actions) + + game_rewards.append(reward) + if reward < 0.1 or done: + step_counts.append(step + 1) + break + + discounted_rewards = eager_append_discounted_rewards( + discounted_rewards, game_rewards, discount_rate) + + discounted_rewards = tf.stack(discounted_rewards) + mean, variance = tf.nn.moments(discounted_rewards, [0]) + normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance) + + for i in range(len(grad_list)): + g = tf.stack(grad_list[i]) + + r = normalized_rewards + while r.shape.ndims < g.shape.ndims: + r = tf.expand_dims(r, -1) + + grad_list[i] = tf.reduce_mean(g * r, axis=0) + + optimizer.apply_gradients( + zip(grad_list, var_list), global_step=tf.train.get_global_step()) + + return tf.stack(step_counts) + + +def 
eager_train_model(policy_network, cart_pole_env, optimizer, iterations): + """Trains the policy network for a given number of iterations.""" + mean_steps_per_iteration = [] + + for _ in range(iterations): + steps_per_game = policy_network.train( + cart_pole_env, + optimizer, + discount_rate=0.95, + num_games=20, + max_steps_per_game=200) + mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game)) + + return mean_steps_per_iteration + + +class EagerGymCartpoleEnv(object): + """An env backed by OpenAI Gym's CartPole environment. + + Used to confirm a functional model only. + """ + + def __init__(self): + cart_pole_env = gym.make('CartPole-v1') + cart_pole_env.seed(0) + cart_pole_env.reset() + self.env = cart_pole_env + + def reset(self): + return self.env.reset() + + def step(self, actions): + obs, reward, done, _ = self.env.step(actions) + return obs, reward, done + + +class EagerRandomCartpoleEnv(object): + """An environment that returns random actions and never finishes. + + Used during benchmarking, it will cause training to run a constant number of + steps. + """ + + def reset(self): + return np.random.normal(size=(4,)) + + def step(self, actions): + with tf.control_dependencies([actions]): + random_obs = np.random.normal(size=(4,)) + fixed_reward = 0.001 + done = False + return random_obs, fixed_reward, done + + +def graph_demo_training(): + """Not used in the benchmark. Used to confirm a functional model.""" + with tf.Graph().as_default(): + tf.set_random_seed(0) + + network = GraphPolicyNetwork(hidden_size=5) + network.build((1, 4)) + env = GraphGymCartpoleEnv() + opt = tf.train.AdamOptimizer(0.05) + + train_ops = graph_train_model(network, env, opt, iterations=5) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + steps_per_iteration = sess.run(train_ops) + for i, steps in enumerate(steps_per_iteration): + print('Step {} iterations: {}'.format(i, steps)) + + +def eager_demo_training(): + with context.eager_mode(): + network = EagerPolicyNetwork(hidden_size=5) + network.build((1, 4)) + env = EagerGymCartpoleEnv() + opt = tf.train.AdamOptimizer(0.05) + + steps_per_iteration = eager_train_model(network, env, opt, iterations=5) + for i, steps in enumerate(steps_per_iteration): + print('Step {} iterations: {}'.format(i, steps)) + + +class RLCartPoleBenchmark(benchmark_base.ReportingBenchmark): + """Actual benchmark. + + Trains the RL agent a fixed number of times, on random environments that + result in constant number of steps. 
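+
+ Timings are collected for both the AutoGraph (graph-mode) and eager
+ implementations, for policy networks with 10, 100 and 1000 hidden units.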
+ """ + + def benchmark_cartpole(self): + + def train_session(sess, ops): + return lambda: sess.run(ops) + + def train_eager(network, env, opt): + return lambda: eager_train_model(network, env, opt, iterations=10) + + for model_size in (10, 100, 1000): + with tf.Graph().as_default(): + network = GraphPolicyNetwork(hidden_size=model_size) + network.build((1, 4)) + env = GraphRandomCartpoleEnv() + opt = tf.train.AdamOptimizer(0.05) + train_ops = graph_train_model(network, env, opt, iterations=10) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + + self.time_execution(('cartpole', 'autograph', model_size), + train_session(sess, train_ops), 20) + + with context.eager_mode(): + network = EagerPolicyNetwork(hidden_size=model_size) + network.build((1, 4)) + env = EagerRandomCartpoleEnv() + opt = tf.train.AdamOptimizer(0.05) + + self.time_execution(('cartpole', 'eager', model_size), + train_eager(network, env, opt), 20) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py index 55faad983f2..3e4d0dc1cec 100644 --- a/tensorflow/contrib/batching/python/ops/batch_ops.py +++ b/tensorflow/contrib/batching/python/ops/batch_ops.py @@ -18,8 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.framework import function +from tensorflow.python.eager import function from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_spec from tensorflow.python.ops import gen_batch_ops # go/tf-wildcard-import # pylint: disable=wildcard-import @@ -101,12 +102,15 @@ def batch_function(num_batch_threads, def decorator(fn): # pylint: disable=missing-docstring def decorated(*args): # pylint: disable=missing-docstring - types = [arg.dtype for arg in args] - @function.Defun(*types) + @function.defun() def computation(*computation_args): return fn(*computation_args) + computation = computation.get_concrete_function( + *[tensor_spec.TensorSpec(dtype=x.dtype, shape=x.shape, name=str(i)) + for i, x in enumerate(args)]) + with ops.name_scope("batch") as name: for a in args: if not isinstance(a, ops.Tensor): @@ -123,7 +127,7 @@ def batch_function(num_batch_threads, f=computation, in_tensors=list(args), captured_tensors=computation.captured_inputs, - Tout=[o.type for o in computation.definition.signature.output_arg]) + Tout=[o.dtype for o in computation.outputs]) return decorated diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py index 01ee8703a93..9109b9c1c91 100644 --- a/tensorflow/contrib/batching/python/ops/batch_ops_test.py +++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py @@ -219,6 +219,7 @@ class BatchOpsTest(test.TestCase): @batch_ops.batch_function(1, 10, 100000) def computation(in_t): + self.assertTrue(in_t.shape is not None) return in_t + 1 inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1]) diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py index 13215ffabf3..8b6ed9f041b 100644 --- a/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py +++ b/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py @@ -81,7 +81,7 @@ class ExpectationImportanceSampleTest(test.TestCase): # Compute E_p[X_1 * X_2 
> 0], with X_i the ith component of X ~ p(x). # Should equal 1/2 because p is a spherical Gaussian centered at (0, 0). def indicator(x): - x1_times_x2 = math_ops.reduce_prod(x, reduction_indices=[-1]) + x1_times_x2 = math_ops.reduce_prod(x, axis=[-1]) return 0.5 * (math_ops.sign(x1_times_x2) + 1.0) prob = mc.expectation_importance_sampler( diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py index 18d40fc1dff..e83a5485119 100644 --- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py +++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py @@ -353,12 +353,12 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True, def _sample_mean(values): """Mean over sample indices. In this module this is always [0].""" - return math_ops.reduce_mean(values, reduction_indices=[0]) + return math_ops.reduce_mean(values, axis=[0]) def _sample_max(values): """Max over sample indices. In this module this is always [0].""" - return math_ops.reduce_max(values, reduction_indices=[0]) + return math_ops.reduce_max(values, axis=[0]) def _get_samples(dist, z, n, seed): diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md index 2c44abed5e1..79052bee35c 100644 --- a/tensorflow/contrib/bigtable/README.md +++ b/tensorflow/contrib/bigtable/README.md @@ -51,25 +51,18 @@ BIGTABLE_TABLE_NAME = '' PREFIX = 'train-' def main(): + tf.enable_eager_execution() + client = tf.contrib.cloud.BigtableClient(GCP_PROJECT_ID, BIGTABLE_INSTANCE_ID) table = client.table(BIGTABLE_TABLE_NAME) dataset = table.keys_by_prefix_dataset(PREFIX) - iterator = dataset.make_initializable_iterator() - get_next_op = iterator.get_next() - with tf.Session() as sess: - print('Initializing the iterator.') - sess.run(iterator.initializer) - print('Retrieving rows:') - row_index = 0 - while True: - try: - row_key = sess.run(get_next_op) - print('Row key %d: %s' % (row_index, row_key)) - row_index += 1 - except tf.errors.OutOfRangeError: - print('Finished reading data!') - break + print('Retrieving rows:') + row_index = 0 + for row_key in dataset: + print('Row key %d: %s' % (row_index, row_key)) + row_index += 1 + print('Finished reading data!') if __name__ == '__main__': main() diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc index f083ce6f44b..e95dc577184 100644 --- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc +++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc @@ -366,6 +366,39 @@ BigtableTestClient::MutateRows( return MakeUnique(request.entries_size()); } +std::unique_ptr> +BigtableTestClient::AsyncMutateRow( + grpc::ClientContext* context, + google::bigtable::v2::MutateRowRequest const& request, + grpc::CompletionQueue* cq) { + LOG(WARNING) << "Call to InMemoryDataClient::" << __func__ + << "(); this will likely cause a crash!"; + return nullptr; +} + +std::unique_ptr<::grpc::ClientAsyncReaderInterface< + ::google::bigtable::v2::SampleRowKeysResponse>> +BigtableTestClient::AsyncSampleRowKeys( + ::grpc::ClientContext* context, + const ::google::bigtable::v2::SampleRowKeysRequest& request, + ::grpc::CompletionQueue* cq, void* tag) { + LOG(WARNING) << "Call to InMemoryDataClient::" << __func__ + << "(); this will likely cause a crash!"; + return nullptr; +} + +std::unique_ptr<::grpc::ClientAsyncReaderInterface< + 
::google::bigtable::v2::MutateRowsResponse>> +BigtableTestClient::AsyncMutateRows( + ::grpc::ClientContext* context, + const ::google::bigtable::v2::MutateRowsRequest& request, + ::grpc::CompletionQueue* cq, void* tag) { + LOG(WARNING) << "Call to InMemoryDataClient::" << __func__ + << "(); this will likely cause a crash!"; + return nullptr; +} + std::shared_ptr BigtableTestClient::Channel() { LOG(WARNING) << "Call to InMemoryDataClient::Channel(); this will likely " "cause a crash!"; diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h index dac2b16a216..c4a1f06bc50 100644 --- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h +++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h @@ -61,6 +61,25 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient { MutateRows(grpc::ClientContext* context, google::bigtable::v2::MutateRowsRequest const& request) override; + std::unique_ptr> + AsyncMutateRow(grpc::ClientContext* context, + google::bigtable::v2::MutateRowRequest const& request, + grpc::CompletionQueue* cq) override; + + std::unique_ptr<::grpc::ClientAsyncReaderInterface< + ::google::bigtable::v2::SampleRowKeysResponse>> + AsyncSampleRowKeys( + ::grpc::ClientContext* context, + const ::google::bigtable::v2::SampleRowKeysRequest& request, + ::grpc::CompletionQueue* cq, void* tag) override; + + std::unique_ptr<::grpc::ClientAsyncReaderInterface< + ::google::bigtable::v2::MutateRowsResponse>> + AsyncMutateRows(::grpc::ClientContext* context, + const ::google::bigtable::v2::MutateRowsRequest& request, + ::grpc::CompletionQueue* cq, void* tag) override; + std::shared_ptr Channel() override; private: diff --git a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py index 316da9ebe15..197f5578eb0 100644 --- a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py +++ b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py @@ -57,7 +57,7 @@ class BigtableOpsTest(test.TestCase): sess.run(write_op) def runReadKeyTest(self, read_ds): - itr = read_ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(read_ds) n = itr.get_next() expected = list(self.COMMON_ROW_KEYS) expected.reverse() @@ -78,7 +78,7 @@ class BigtableOpsTest(test.TestCase): self.runReadKeyTest(self._table.keys_by_range_dataset("r1", "r4")) def runScanTest(self, read_ds): - itr = read_ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(read_ds) n = itr.get_next() expected_keys = list(self.COMMON_ROW_KEYS) expected_keys.reverse() @@ -120,7 +120,7 @@ class BigtableOpsTest(test.TestCase): def testLookup(self): ds = self._table.keys_by_prefix_dataset("r") ds = ds.apply(self._table.lookup_columns(cf1="c1")) - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) n = itr.get_next() expected_keys = list(self.COMMON_ROW_KEYS) expected_values = list(self.COMMON_VALUES) @@ -141,7 +141,7 @@ class BigtableOpsTest(test.TestCase): def testSampleKeys(self): ds = self._table.sample_keys() - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) n = itr.get_next() expected_key = self.COMMON_ROW_KEYS[0] with self.cached_session() as sess: @@ -161,7 +161,7 @@ class BigtableOpsTest(test.TestCase): sess.run(n) def runSampleKeyPairsTest(self, ds, 
expected_key_pairs): - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) n = itr.get_next() with self.cached_session() as sess: self._writeCommonValues(sess) @@ -218,7 +218,7 @@ class BigtableOpsTest(test.TestCase): def testSampleKeyPairsPrefixAndStartKey(self): ds = bigtable_api._BigtableSampleKeyPairsDataset( self._table, prefix="r", start="r1", end="") - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) with self.cached_session() as sess: with self.assertRaises(errors.InvalidArgumentError): sess.run(itr.initializer) @@ -226,14 +226,14 @@ class BigtableOpsTest(test.TestCase): def testSampleKeyPairsPrefixAndEndKey(self): ds = bigtable_api._BigtableSampleKeyPairsDataset( self._table, prefix="r", start="", end="r3") - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) with self.cached_session() as sess: with self.assertRaises(errors.InvalidArgumentError): sess.run(itr.initializer) def testParallelScanPrefix(self): ds = self._table.parallel_scan_prefix(prefix="r", cf1="c1") - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) n = itr.get_next() with self.cached_session() as sess: self._writeCommonValues(sess) @@ -251,7 +251,7 @@ class BigtableOpsTest(test.TestCase): def testParallelScanRange(self): ds = self._table.parallel_scan_range(start="r1", end="r4", cf1="c1") - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) n = itr.get_next() with self.cached_session() as sess: self._writeCommonValues(sess) diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py index 7c87b0daeb0..9f97934193d 100644 --- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py +++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py @@ -222,7 +222,7 @@ class BigtableTable(object): A `tf.data.Dataset`. containing `tf.string` Tensors corresponding to all of the row keys matching that prefix. """ - return _BigtablePrefixKeyDataset(self, prefix) + return dataset_ops.DatasetV1Adapter(_BigtablePrefixKeyDataset(self, prefix)) def sample_keys(self): """Retrieves a sampling of row keys from the Bigtable table. @@ -234,7 +234,7 @@ class BigtableTable(object): Returns: A `tf.data.Dataset` returning string row keys. """ - return _BigtableSampleKeysDataset(self) + return dataset_ops.DatasetV1Adapter(_BigtableSampleKeysDataset(self)) def scan_prefix(self, prefix, probability=None, columns=None, **kwargs): """Retrieves row (including values) from the Bigtable service. @@ -279,7 +279,8 @@ class BigtableTable(object): """ probability = _normalize_probability(probability) normalized = _normalize_columns(columns, kwargs) - return _BigtableScanDataset(self, prefix, "", "", normalized, probability) + return dataset_ops.DatasetV1Adapter( + _BigtableScanDataset(self, prefix, "", "", normalized, probability)) def scan_range(self, start, end, probability=None, columns=None, **kwargs): """Retrieves rows (including values) from the Bigtable service. 
@@ -324,7 +325,8 @@ class BigtableTable(object): """ probability = _normalize_probability(probability) normalized = _normalize_columns(columns, kwargs) - return _BigtableScanDataset(self, "", start, end, normalized, probability) + return dataset_ops.DatasetV1Adapter( + _BigtableScanDataset(self, "", start, end, normalized, probability)) def parallel_scan_prefix(self, prefix, @@ -380,7 +382,8 @@ class BigtableTable(object): """ probability = _normalize_probability(probability) normalized = _normalize_columns(columns, kwargs) - ds = _BigtableSampleKeyPairsDataset(self, prefix, "", "") + ds = dataset_ops.DatasetV1Adapter( + _BigtableSampleKeyPairsDataset(self, prefix, "", "")) return self._make_parallel_scan_dataset(ds, num_parallel_scans, probability, normalized) @@ -442,7 +445,8 @@ class BigtableTable(object): """ probability = _normalize_probability(probability) normalized = _normalize_columns(columns, kwargs) - ds = _BigtableSampleKeyPairsDataset(self, "", start, end) + ds = dataset_ops.DatasetV1Adapter( + _BigtableSampleKeyPairsDataset(self, "", start, end)) return self._make_parallel_scan_dataset(ds, num_parallel_scans, probability, normalized) diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD index 14b6fc4ac26..d3b23d949ee 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD +++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD @@ -132,6 +132,7 @@ py_library( srcs = ["estimator.py"], srcs_version = "PY2AND3", deps = [ + ":custom_loss_head", ":estimator_utils", ":model", "//tensorflow/contrib/boosted_trees:losses", diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py index a3df272e692..b314b4d74df 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py @@ -41,7 +41,8 @@ def make_custom_export_strategy(name, convert_fn, feature_columns, export_input_fn, - use_core_columns=False): + use_core_columns=False, + feature_engineering_fn=None): """Makes custom exporter of GTFlow tree format. Args: @@ -52,6 +53,7 @@ def make_custom_export_strategy(name, export_input_fn: A function that takes no arguments and returns an `InputFnOps`. use_core_columns: A boolean, whether core feature columns were used. + feature_engineering_fn: Feature eng function to be called on the input. Returns: An `ExportStrategy`. 
@@ -59,9 +61,12 @@ def make_custom_export_strategy(name, base_strategy = saved_model_export_utils.make_export_strategy( serving_input_fn=export_input_fn, strip_default_attrs=True) input_fn = export_input_fn() + features = input_fn.features + if feature_engineering_fn is not None: + features, _ = feature_engineering_fn(features, labels=None) (sorted_feature_names, dense_floats, sparse_float_indices, _, _, sparse_int_indices, _, _) = gbdt_batch.extract_features( - input_fn.features, feature_columns, use_core_columns) + features, feature_columns, use_core_columns) def export_fn(estimator, export_dir, checkpoint_path=None, eval_result=None): """A wrapper to export to SavedModel, and convert it to other formats.""" diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py index ca73e4af2fb..358404cd946 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py @@ -36,7 +36,7 @@ from tensorflow.contrib.learn.python.learn.estimators import estimator from tensorflow.contrib.learn.python.learn.estimators import head as head_lib from tensorflow.python.estimator import estimator as core_estimator from tensorflow.contrib.learn.python.learn.estimators import model_fn -from tensorflow.python.feature_column import feature_column as feature_column_lib +from tensorflow.python.feature_column import feature_column_lib from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py index 38d19976ef3..a178820841c 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from tensorflow.contrib.boosted_trees.estimator_batch import model from tensorflow.contrib.boosted_trees.python.utils import losses from tensorflow.contrib.learn.python.learn.estimators import estimator @@ -26,7 +28,8 @@ from tensorflow.python.estimator.canned import head as core_head_lib from tensorflow.python.estimator import estimator as core_estimator from tensorflow.python.ops import math_ops from tensorflow.python.ops.losses import losses as core_losses - +from tensorflow.contrib.boosted_trees.estimator_batch import custom_loss_head +from tensorflow.python.ops import array_ops # ================== Old estimator interface=================================== # The estimators below were designed for old feature columns and old estimator @@ -414,6 +417,108 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator): config=config, feature_engineering_fn=feature_engineering_fn) +# When using this estimator, make sure to regularize the hessian (at least l2, +# min_node_weight)! +# TODO(nponomareva): extend to take multiple quantiles in one go. +class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator): + """An estimator that does quantile regression and returns quantile estimates. 
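+
+ For a quantile q in (0, 1), training uses a smoothed version of the quantile
+ (pinball) loss, which charges q per unit of under-prediction and (1 - q) per
+ unit of over-prediction; as a result, roughly a fraction q of the labels
+ should fall below the fitted estimate. For example:
+
+   regressor = GradientBoostedDecisionTreeQuantileRegressor(
+       learner_config=learner_config,
+       examples_per_layer=1000,
+       quantiles=[0.95],
+       num_trees=100)
+   regressor.fit(input_fn=train_input_fn, steps=1000)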
+ """ + + def __init__(self, + learner_config, + examples_per_layer, + quantiles, + label_dimension=1, + num_trees=None, + feature_columns=None, + weight_column_name=None, + model_dir=None, + config=None, + feature_engineering_fn=None, + logits_modifier_function=None, + center_bias=True, + use_core_libs=False, + output_leaf_index=False, + override_global_step_value=None, + num_quantiles=100): + """Initializes a GradientBoostedDecisionTreeQuantileRegressor instance. + + Args: + learner_config: A config for the learner. + examples_per_layer: Number of examples to accumulate before growing a + layer. It can also be a function that computes the number of examples + based on the depth of the layer that's being built. + quantiles: a list of quantiles for the loss, each between 0 and 1. + label_dimension: Dimension of regression label. This is the size + of the last dimension of the labels `Tensor` (typically, this has shape + `[batch_size, label_dimension]`). When label_dimension>1, it is + recommended to use multiclass strategy diagonal hessian or full hessian. + num_trees: An int, number of trees to build. + feature_columns: A list of feature columns. + weight_column_name: Name of the column for weights, or None if not + weighted. + model_dir: Directory for model exports, etc. + config: `RunConfig` object to configure the runtime settings. + feature_engineering_fn: Feature engineering function. Takes features and + labels which are the output of `input_fn` and returns features and + labels which will be fed into the model. + logits_modifier_function: A modifier function for the logits. + center_bias: Whether a separate tree should be created for first fitting + the bias. + use_core_libs: Whether feature columns and loss are from the core (as + opposed to contrib) version of tensorflow. + output_leaf_index: whether to output leaf indices along with predictions + during inference. The leaf node indexes are available in predictions + dict by the key 'leaf_index'. For example, + result_dict = classifier.predict(...) + for example_prediction_result in result_dict: + # access leaf index list by example_prediction_result["leaf_index"] + # which contains one leaf index per tree + override_global_step_value: If after the training is done, global step + value must be reset to this value. This should be used to reset global + step to a number > number of steps used to train the current ensemble. + For example, the usual way is to train a number of trees and set a very + large number of training steps. When the training is done (number of + trees were trained), this parameter can be used to set the global step + to a large value, making it look like that number of training steps ran. + If None, no override of global step will happen. + num_quantiles: Number of quantiles to build for numeric feature values. + """ + + if len(quantiles) > 1: + raise ValueError('For now, just one quantile per estimator is supported') + + def _quantile_regression_head(quantile): + # Use quantile regression. 
+ head = custom_loss_head.CustomLossHead( + loss_fn=functools.partial( + losses.per_example_quantile_regression_loss, quantile=quantile), + link_fn=array_ops.identity, + logit_dimension=label_dimension) + return head + + learner_config.num_classes = max(2, label_dimension) + + super(GradientBoostedDecisionTreeQuantileRegressor, self).__init__( + model_fn=model.model_builder, + params={ + 'head': _quantile_regression_head(quantiles[0]), + 'feature_columns': feature_columns, + 'learner_config': learner_config, + 'num_trees': num_trees, + 'weight_column_name': weight_column_name, + 'examples_per_layer': examples_per_layer, + 'logits_modifier_function': logits_modifier_function, + 'center_bias': center_bias, + 'use_core_libs': use_core_libs, + 'output_leaf_index': False, + 'override_global_step_value': override_global_step_value, + 'num_quantiles': num_quantiles, + }, + model_dir=model_dir, + config=config, + feature_engineering_fn=feature_engineering_fn) + # ================== New Estimator interface=================================== # The estimators below use new core Estimator interface and must be used with # new feature columns and heads. @@ -437,12 +542,42 @@ def core_multiclass_head( # pylint:disable=protected-access head_fn = core_head_lib._multi_class_head_with_softmax_cross_entropy_loss( - n_classes=n_classes, loss_fn=loss_fn, loss_reduction=loss_reduction) + n_classes=n_classes, + loss_fn=loss_fn, + loss_reduction=loss_reduction, + weight_column=weight_column) # pylint:enable=protected-access return head_fn +# For quantile regression, use this head with Core..Estimator, or use +# Core..QuantileRegressor directly, +def core_quantile_regression_head( + quantiles, + label_dimension=1, + weight_column=None, + loss_reduction=core_losses.Reduction.SUM_OVER_NONZERO_WEIGHTS): + """Core head for quantile regression problems.""" + + def loss_fn(labels, logits): + result = losses.per_example_quantile_regression_loss( + labels=labels, + predictions=logits, + weights=weight_column, + quantile=quantiles) + return result[0] + + # pylint:disable=protected-access + head_fn = core_head_lib._regression_head( + label_dimension=label_dimension, + loss_fn=loss_fn, + loss_reduction=loss_reduction, + weight_column=weight_column) + # pylint:enable=protected-access + return head_fn + + class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator): """An estimator using gradient boosted decision trees. @@ -606,3 +741,104 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator): super(CoreGradientBoostedDecisionTreeRanker, self).__init__( model_fn=_model_fn, model_dir=model_dir, config=config) + + +# When using this estimator, make sure to regularize the hessian (at least l2, +# min_node_weight)! +# TODO(nponomareva): extend to take multiple quantiles in one go. +class CoreGradientBoostedDecisionTreeQuantileRegressor( + core_estimator.Estimator): + """An estimator that does quantile regression and returns quantile estimates. + """ + + def __init__(self, + learner_config, + examples_per_layer, + quantiles, + label_dimension=1, + num_trees=None, + feature_columns=None, + weight_column_name=None, + model_dir=None, + config=None, + label_keys=None, + feature_engineering_fn=None, + logits_modifier_function=None, + center_bias=True, + output_leaf_index=False, + num_quantiles=100): + """Initializes a core version of GradientBoostedDecisionTreeEstimator. + + Args: + learner_config: A config for the learner. + examples_per_layer: Number of examples to accumulate before growing a + layer. 
It can also be a function that computes the number of examples + based on the depth of the layer that's being built. + quantiles: a list of quantiles for the loss, each between 0 and 1. + label_dimension: Dimension of regression label. This is the size + of the last dimension of the labels `Tensor` (typically, this has shape + `[batch_size, label_dimension]`). When label_dimension>1, it is + recommended to use multiclass strategy diagonal hessian or full hessian. + num_trees: An int, number of trees to build. + feature_columns: A list of feature columns. + weight_column_name: Name of the column for weights, or None if not + weighted. + model_dir: Directory for model exports, etc. + config: `RunConfig` object to configure the runtime settings. + label_keys: Optional list of strings with size `[n_classes]` defining the + label vocabulary. Only supported for `n_classes` > 2. + feature_engineering_fn: Feature engineering function. Takes features and + labels which are the output of `input_fn` and returns features and + labels which will be fed into the model. + logits_modifier_function: A modifier function for the logits. + center_bias: Whether a separate tree should be created for first fitting + the bias. + output_leaf_index: whether to output leaf indices along with predictions + during inference. The leaf node indexes are available in predictions + dict by the key 'leaf_index'. For example, + result_dict = classifier.predict(...) + for example_prediction_result in result_dict: + # access leaf index list by example_prediction_result["leaf_index"] + # which contains one leaf index per tree + num_quantiles: Number of quantiles to build for numeric feature values. + """ + if len(quantiles) > 1: + raise ValueError('For now, just one quantile per estimator is supported') + + def _model_fn(features, labels, mode, config): + return model.model_builder( + features=features, + labels=labels, + mode=mode, + config=config, + params={ + 'head': + core_quantile_regression_head( + quantiles[0], label_dimension=label_dimension), + 'feature_columns': + feature_columns, + 'learner_config': + learner_config, + 'num_trees': + num_trees, + 'weight_column_name': + weight_column_name, + 'examples_per_layer': + examples_per_layer, + 'center_bias': + center_bias, + 'logits_modifier_function': + logits_modifier_function, + 'use_core_libs': + True, + 'output_leaf_index': + output_leaf_index, + 'override_global_step_value': + None, + 'num_quantiles': + num_quantiles, + }, + output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC) + + super(CoreGradientBoostedDecisionTreeQuantileRegressor, self).__init__( + model_fn=_model_fn, model_dir=model_dir, config=config) diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py index c155128c0e4..ee052ac6038 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py @@ -25,6 +25,7 @@ from tensorflow.contrib.boosted_trees.proto import learner_pb2 from tensorflow.contrib.layers.python.layers import feature_column as contrib_feature_column from tensorflow.contrib.learn.python.learn.estimators import run_config from tensorflow.python.estimator.canned import head as head_lib +from tensorflow.python.estimator.inputs import numpy_io from tensorflow.python.feature_column import feature_column_lib as core_feature_column from tensorflow.python.framework import constant_op from tensorflow.python.framework 
import dtypes @@ -47,8 +48,8 @@ def _multiclass_train_input_fn(): features = { "x": constant_op.constant([[2.], [1.], [1.], [5.], [3.5], [4.6], [3.5]]) } - label = constant_op.constant( - [[1], [0], [0], [2], [2], [0], [1]], dtype=dtypes.int32) + label = constant_op.constant([[1], [0], [0], [2], [2], [0], [1]], + dtype=dtypes.int32) return features, label @@ -77,6 +78,59 @@ def _infer_ranking_train_input_fn(): return features, None +_QUANTILE_REGRESSION_SIZE = 1000 + + +def _quantile_regression_input_fns(two_dimension=False): + # The data generation is taken from + # http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html + np.random.seed(1) + + def f(x): + """The function to predict.""" + return x * np.sin(x) + + def g(x): + """The function to predict.""" + return x * np.cos(x) + + # Training data. + x = np.atleast_2d(np.random.uniform(0, 10.0, + size=_QUANTILE_REGRESSION_SIZE)).T + x = x.astype(np.float32) + + # Labels. + if not two_dimension: + y = f(x).ravel() + else: + y = np.column_stack((f(x).ravel(), g(x).ravel())) + + # Add random noise. + dy = 1.5 + 1.0 * np.random.random(y.shape) + noise = np.random.normal(0, dy) + y += noise + y_original = y.astype(np.float32) + if not two_dimension: + y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1) + + train_input_fn = numpy_io.numpy_input_fn( + x=x, + y=y, + batch_size=_QUANTILE_REGRESSION_SIZE, + num_epochs=None, + shuffle=True) + + # Test on the training data to make sure the predictions are calibrated. + test_input_fn = numpy_io.numpy_input_fn( + x=x, + y=y, + batch_size=_QUANTILE_REGRESSION_SIZE, + num_epochs=1, + shuffle=False) + + return train_input_fn, test_input_fn, y_original + + class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase): def setUp(self): @@ -341,6 +395,130 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase): for prediction_dict in result_iter: self.assertTrue("classes" in prediction_dict) + # One dimensional quantile regression. + def testQuantileRegression(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 2 + learner_config.constraints.max_tree_depth = 3 + learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE + learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.tree_complexity = ( + 1.0 / _QUANTILE_REGRESSION_SIZE) + + train_input_fn, test_input_fn, y = _quantile_regression_input_fns() + + # 95% percentile. + model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.95], + learner_config=learner_config, + num_trees=100, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_upper.fit(input_fn=train_input_fn, steps=1000) + result_iter = model_upper.predict(input_fn=test_input_fn) + upper = [] + for prediction_dict in result_iter: + upper.append(prediction_dict["scores"]) + + frac_below_upper = round(1. 
* np.count_nonzero(upper > y) / len(y), 3) + # +/- 3% + self.assertTrue(frac_below_upper >= 0.92) + self.assertTrue(frac_below_upper <= 0.98) + + train_input_fn, test_input_fn, _ = _quantile_regression_input_fns() + model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.05], + learner_config=learner_config, + num_trees=100, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_lower.fit(input_fn=train_input_fn, steps=1000) + result_iter = model_lower.predict(input_fn=test_input_fn) + lower = [] + for prediction_dict in result_iter: + lower.append(prediction_dict["scores"]) + + frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3) + # +/- 3% + self.assertTrue(frac_above_lower >= 0.92) + self.assertTrue(frac_above_lower <= 0.98) + + # Multi-dimensional quantile regression. + def testQuantileRegressionMultiDimLabel(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 2 + learner_config.constraints.max_tree_depth = 3 + learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE + learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.tree_complexity = ( + 1.0 / _QUANTILE_REGRESSION_SIZE) + + train_input_fn, test_input_fn, y = _quantile_regression_input_fns( + two_dimension=True) + + # 95% percentile. + model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.95], + learner_config=learner_config, + label_dimension=2, + num_trees=100, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_upper.fit(input_fn=train_input_fn, steps=1000) + result_iter = model_upper.predict(input_fn=test_input_fn) + upper = [] + for prediction_dict in result_iter: + upper.append(prediction_dict["scores"]) + + count_below_upper = np.count_nonzero(upper > y, axis=0) + count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1)) + frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3) + frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3) + frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3) + # +/- 3% + self.assertTrue(frac_below_upper_0 >= 0.92) + self.assertTrue(frac_below_upper_0 <= 0.98) + self.assertTrue(frac_below_upper_1 >= 0.92) + self.assertTrue(frac_below_upper_1 <= 0.98) + self.assertTrue(frac_both_below_upper >= 0.92) + self.assertTrue(frac_both_below_upper <= 0.98) + + train_input_fn, test_input_fn, _ = _quantile_regression_input_fns( + two_dimension=True) + model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.05], + learner_config=learner_config, + label_dimension=2, + num_trees=100, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_lower.fit(input_fn=train_input_fn, steps=1000) + result_iter = model_lower.predict(input_fn=test_input_fn) + lower = [] + for prediction_dict in result_iter: + lower.append(prediction_dict["scores"]) + + count_above_lower = np.count_nonzero(lower < y, axis=0) + count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1)) + frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3) + frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3) + frac_both_above_lower = round(1. 
* count_both_aboce_lower / len(y), 3) + # +/- 3% + self.assertTrue(frac_above_lower_0 >= 0.92) + self.assertTrue(frac_above_lower_0 <= 0.98) + self.assertTrue(frac_above_lower_1 >= 0.92) + self.assertTrue(frac_above_lower_1 <= 0.98) + self.assertTrue(frac_both_above_lower >= 0.92) + self.assertTrue(frac_both_above_lower <= 0.98) + class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): @@ -489,8 +667,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): feature_columns = [ core_feature_column.weighted_categorical_column( - categorical_column=core_feature_column. - categorical_column_with_vocabulary_list( + categorical_column=core_feature_column + .categorical_column_with_vocabulary_list( key="word", vocabulary_list=["the", "cat", "dog"]), weight_feature_key="weight") ] @@ -509,8 +687,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): # Weights for the words are 5 - cat, 6- dog and 1 -the. features_dict["word"] = sparse_tensor.SparseTensor( indices=[[0, 0], [0, 1], [1, 0], [3, 0]], - values=constant_op.constant( - ["the", "cat", "dog", "the"], dtype=dtypes.string), + values=constant_op.constant(["the", "cat", "dog", "the"], + dtype=dtypes.string), dense_shape=[4, 3]) features_dict["weight"] = sparse_tensor.SparseTensor( indices=[[0, 0], [0, 1], [1, 0], [3, 0]], @@ -534,6 +712,132 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): est.evaluate(input_fn=input_fn, steps=1) est.predict(input_fn=input_fn) + # One dimensional quantile regression. + def testQuantileRegression(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 2 + learner_config.constraints.max_tree_depth = 3 + learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE + learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.tree_complexity = ( + 1.0 / _QUANTILE_REGRESSION_SIZE) + + train_input_fn, test_input_fn, y = _quantile_regression_input_fns() + y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1) + + # 95% percentile. + model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.95], + learner_config=learner_config, + num_trees=100, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_upper.train(input_fn=train_input_fn, steps=1000) + result_iter = model_upper.predict(input_fn=test_input_fn) + upper = [] + for prediction_dict in result_iter: + upper.append(prediction_dict["predictions"]) + + frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3) + # +/- 3% + self.assertTrue(frac_below_upper >= 0.92) + self.assertTrue(frac_below_upper <= 0.98) + + train_input_fn, test_input_fn, _ = _quantile_regression_input_fns() + model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.05], + learner_config=learner_config, + num_trees=100, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_lower.train(input_fn=train_input_fn, steps=1000) + result_iter = model_lower.predict(input_fn=test_input_fn) + lower = [] + for prediction_dict in result_iter: + lower.append(prediction_dict["predictions"]) + + frac_above_lower = round(1. 
* np.count_nonzero(lower < y) / len(y), 3) + # +/- 3% + self.assertTrue(frac_above_lower >= 0.92) + self.assertTrue(frac_above_lower <= 0.98) + + # Multi-dimensional quantile regression. + def testQuantileRegressionMultiDimLabel(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 2 + learner_config.constraints.max_tree_depth = 3 + learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE + learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.tree_complexity = ( + 1.0 / _QUANTILE_REGRESSION_SIZE) + + train_input_fn, test_input_fn, y = _quantile_regression_input_fns( + two_dimension=True) + y = y.reshape(_QUANTILE_REGRESSION_SIZE, 2) + + # 95% percentile. + model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.95], + learner_config=learner_config, + num_trees=100, + label_dimension=2, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_upper.train(input_fn=train_input_fn, steps=1000) + result_iter = model_upper.predict(input_fn=test_input_fn) + upper = [] + for prediction_dict in result_iter: + upper.append(prediction_dict["predictions"]) + + count_below_upper = np.count_nonzero(upper > y, axis=0) + count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1)) + frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3) + frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3) + frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3) + # +/- 3% + self.assertTrue(frac_below_upper_0 >= 0.92) + self.assertTrue(frac_below_upper_0 <= 0.98) + self.assertTrue(frac_below_upper_1 >= 0.92) + self.assertTrue(frac_below_upper_1 <= 0.98) + self.assertTrue(frac_both_below_upper >= 0.92) + self.assertTrue(frac_both_below_upper <= 0.98) + + train_input_fn, test_input_fn, _ = _quantile_regression_input_fns( + two_dimension=True) + model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.05], + learner_config=learner_config, + num_trees=100, + label_dimension=2, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_lower.train(input_fn=train_input_fn, steps=1000) + result_iter = model_lower.predict(input_fn=test_input_fn) + lower = [] + for prediction_dict in result_iter: + lower.append(prediction_dict["predictions"]) + + count_above_lower = np.count_nonzero(lower < y, axis=0) + count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1)) + frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3) + frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3) + frac_both_above_lower = round(1. 
* count_both_aboce_lower / len(y), 3) + # +/- 3% + self.assertTrue(frac_above_lower_0 >= 0.92) + self.assertTrue(frac_above_lower_0 <= 0.98) + self.assertTrue(frac_above_lower_1 >= 0.92) + self.assertTrue(frac_above_lower_1 <= 0.98) + self.assertTrue(frac_both_above_lower >= 0.92) + self.assertTrue(frac_both_above_lower <= 0.98) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/contrib/boosted_trees/examples/boston.py b/tensorflow/contrib/boosted_trees/examples/boston.py index 54c4ff059e3..09b240a7006 100644 --- a/tensorflow/contrib/boosted_trees/examples/boston.py +++ b/tensorflow/contrib/boosted_trees/examples/boston.py @@ -90,13 +90,13 @@ def _make_experiment_fn(output_dir): (x_train, y_train), (x_test, y_test) = tf.keras.datasets.boston_housing.load_data() - train_input_fn = tf.estimator.inputs.numpy_input_fn( + train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn( x={"x": x_train}, y=y_train, batch_size=FLAGS.batch_size, num_epochs=None, shuffle=True) - eval_input_fn = tf.estimator.inputs.numpy_input_fn( + eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn( x={"x": x_test}, y=y_test, num_epochs=1, shuffle=False) feature_columns = [ diff --git a/tensorflow/contrib/boosted_trees/examples/boston_combined.py b/tensorflow/contrib/boosted_trees/examples/boston_combined.py index e04b56afbfd..d640af354f5 100644 --- a/tensorflow/contrib/boosted_trees/examples/boston_combined.py +++ b/tensorflow/contrib/boosted_trees/examples/boston_combined.py @@ -80,13 +80,13 @@ def _make_experiment_fn(output_dir): (x_train, y_train), (x_test, y_test) = tf.keras.datasets.boston_housing.load_data() - train_input_fn = tf.estimator.inputs.numpy_input_fn( + train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn( x={"x": x_train}, y=y_train, batch_size=FLAGS.batch_size, num_epochs=None, shuffle=True) - eval_input_fn = tf.estimator.inputs.numpy_input_fn( + eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn( x={"x": x_test}, y=y_test, num_epochs=1, shuffle=False) feature_columns = [ diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc index 8edb5d6c640..6d78e27e8f6 100644 --- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc @@ -834,8 +834,13 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel { root_gradient_stats *= normalizer_ratio; NodeStats root_stats = state->ComputeNodeStats(root_gradient_stats); int32 best_feature_idx = 0; + bool best_feature_updated = false; NodeStats best_right_node_stats(0); NodeStats best_left_node_stats(0); + CHECK(end_index - start_index >= 2) + << "Partition should have a non bias feature. 
Start index " + << start_index << " and end index " << end_index; + for (int64 feature_idx = start_index + 1; feature_idx < end_index; ++feature_idx) { GradientStats left_gradient_stats(*gradients_t, *hessians_t, @@ -845,11 +850,13 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel { root_gradient_stats - left_gradient_stats; NodeStats left_stats = state->ComputeNodeStats(left_gradient_stats); NodeStats right_stats = state->ComputeNodeStats(right_gradient_stats); - if (left_stats.gain + right_stats.gain > best_gain) { + if (!best_feature_updated || + left_stats.gain + right_stats.gain > best_gain) { best_gain = left_stats.gain + right_stats.gain; best_left_node_stats = left_stats; best_right_node_stats = right_stats; best_feature_idx = feature_idx; + best_feature_updated = true; } } SplitInfo split_info; @@ -864,7 +871,7 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel { << feature_ids(best_feature_idx, 0) << ", " << feature_ids(best_feature_idx, 1) << "\nPartition IDS: " << partition_ids(start_index) << " " - << partition_ids(best_feature_idx); + << partition_ids(best_feature_idx) << " and best gain " << best_gain; equality_split->set_feature_id(feature_ids(best_feature_idx, 0)); auto* left_child = split_info.mutable_left_child(); auto* right_child = split_info.mutable_right_child(); diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py index 4da25298cb8..d26af584197 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py @@ -119,7 +119,7 @@ class EqualitySplitHandler(base_split_handler.BaseSplitHandler): def not_active_inputs(): return (constant_op.constant([], dtype=dtypes.int32), - constant_op.constant([], dtype=dtypes.int64, shape=[1, 2]), + constant_op.constant_v1([], dtype=dtypes.int64, shape=[1, 2]), empty_gradients, empty_hessians) def active_inputs(): diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py index a2f708081a4..386dc19fc7b 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py @@ -36,9 +36,9 @@ def get_empty_tensors(gradient_shape, hessian_shape): empty_hess_shape = [1] + hessian_shape.as_list() empty_grad_shape = [1] + gradient_shape.as_list() - empty_gradients = constant_op.constant( + empty_gradients = constant_op.constant_v1( [], dtype=dtypes.float32, shape=empty_grad_shape) - empty_hessians = constant_op.constant( + empty_hessians = constant_op.constant_v1( [], dtype=dtypes.float32, shape=empty_hess_shape) return empty_gradients, empty_hessians @@ -486,8 +486,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0]) hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13]) partition_ids = [0, 0, 0, 1] - indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2]) - values = array_ops.constant([], dtype=dtypes.int64) + indices = constant_op.constant_v1([], dtype=dtypes.int64, shape=[0, 2]) + values = constant_op.constant_v1([], dtype=dtypes.int64) gradient_shape = tensor_shape.scalar() hessian_shape = tensor_shape.scalar() diff --git 
a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py index 1fffbb5f660..0476bed2cd3 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py @@ -605,7 +605,7 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column, quantile_buckets, example_partition_ids, gradients, hessians, weights, empty_gradients, empty_hessians): """Updates the state for dense split handler.""" - empty_float = constant_op.constant([], dtype=dtypes.float32) + empty_float = constant_op.constant_v1([], dtype=dtypes.float32) quantile_values, quantile_weights = control_flow_ops.cond( is_active[1], # For the next layer, this handler is inactive. @@ -621,8 +621,8 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column, return (example_partition_ids, quantized_feature, gradients, hessians) def not_ready_inputs_fn(): - return (constant_op.constant([], dtype=dtypes.int32), - constant_op.constant([[]], dtype=dtypes.int64, shape=[1, 2]), + return (constant_op.constant_v1([], dtype=dtypes.int32), + constant_op.constant_v1([[]], dtype=dtypes.int64, shape=[1, 2]), empty_gradients, empty_hessians) example_partition_ids, feature_ids, gradients, hessians = ( @@ -708,11 +708,11 @@ def sparse_make_stats_update( def quantiles_not_ready(): """The subgraph for when the quantiles are not ready.""" - return (constant_op.constant([], dtype=dtypes.int32), - constant_op.constant([], dtype=dtypes.int64, shape=[1, 2]), + return (constant_op.constant_v1([], dtype=dtypes.int32), + constant_op.constant_v1([], dtype=dtypes.int64, shape=[1, 2]), empty_gradients, empty_hessians) - empty_float = constant_op.constant([], dtype=dtypes.float32) + empty_float = constant_op.constant_v1([], dtype=dtypes.float32) handler_not_active = (constant_op.constant( [], dtype=dtypes.int64, shape=[0, 2]), empty_float, constant_op.constant([0, 1], dtype=dtypes.int64), diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py index 74b0ea6989c..4a1b528646e 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py @@ -39,9 +39,9 @@ def get_empty_tensors(gradient_shape, hessian_shape): empty_hess_shape = [1] + hessian_shape.as_list() empty_grad_shape = [1] + gradient_shape.as_list() - empty_gradients = constant_op.constant( + empty_gradients = constant_op.constant_v1( [], dtype=dtypes.float32, shape=empty_grad_shape) - empty_hessians = constant_op.constant( + empty_hessians = constant_op.constant_v1( [], dtype=dtypes.float32, shape=empty_hess_shape) return empty_gradients, empty_hessians @@ -1476,9 +1476,9 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): def testEmpty(self): with self.cached_session() as sess: - indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2]) + indices = constant_op.constant_v1([], dtype=dtypes.int64, shape=[0, 2]) # No values in this feature column in this mini-batch. 
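
[Editor's note] The recurring edit in the surrounding split-handler hunks swaps `constant_op.constant` (or `array_ops.constant`) for `constant_op.constant_v1` wherever an empty value is paired with an explicit shape such as `[1, 2]` or `[0, 2]`, presumably to keep the v1 `shape` semantics for that value/shape combination. A minimal sketch of the call pattern as it appears in the patch, nothing beyond what the hunks already show:

```python
# Call pattern used throughout this patch for empty placeholder tensors.
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes

empty_indices = constant_op.constant_v1([], dtype=dtypes.int64, shape=[0, 2])
empty_values = constant_op.constant_v1([], dtype=dtypes.float32)
```
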
- values = array_ops.constant([], dtype=dtypes.float32) + values = constant_op.constant_v1([], dtype=dtypes.float32) sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1]) gradient_shape = tensor_shape.scalar() @@ -1549,8 +1549,9 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): sparse_column = array_ops.sparse_placeholder(dtypes.float32) # We have two batches - at first, a sparse feature is empty. - empty_indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2]) - empty_values = array_ops.constant([], dtype=dtypes.float32) + empty_indices = constant_op.constant_v1([], dtype=dtypes.int64, + shape=[0, 2]) + empty_values = constant_op.constant_v1([], dtype=dtypes.float32) empty_sparse_column = sparse_tensor.SparseTensor(empty_indices, empty_values, [4, 2]) empty_sparse_column = empty_sparse_column.eval(session=sess) diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index ab5713fbe26..9fdc2fc0c2c 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -897,9 +897,9 @@ class GradientBoostedDecisionTreeModel(object): empty_hess_shape = [1] + self._hessian_shape.as_list() empty_grad_shape = [1] + self._gradient_shape.as_list() - empty_gradients = constant_op.constant( + empty_gradients = constant_op.constant_v1( [], dtype=dtypes.float32, shape=empty_grad_shape) - empty_hessians = constant_op.constant( + empty_hessians = constant_op.constant_v1( [], dtype=dtypes.float32, shape=empty_hess_shape) active_handlers = array_ops.unstack(active_handlers, axis=0) @@ -1257,13 +1257,12 @@ class GradientBoostedDecisionTreeModel(object): def _get_replica_device_setter(self, worker_device): """Creates a replica device setter.""" ps_tasks = self._num_ps_replicas - ps_ops = [ - "Variable", - "VariableV2", + ps_ops = list(device_setter.STANDARD_PS_OPS) + ps_ops.extend([ "DecisionTreeEnsembleResourceHandleOp", "StatsAccumulatorScalarResourceHandleOp", "StatsAccumulatorTensorResourceHandleOp", - ] + ]) ps_strategy = _OpRoundRobinStrategy(ps_ops, ps_tasks) return device_setter.replica_device_setter( worker_device=worker_device, diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py index b5ebaf19995..220e981618b 100644 --- a/tensorflow/contrib/boosted_trees/python/utils/losses.py +++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py @@ -48,6 +48,47 @@ def per_example_logistic_loss(labels, weights, predictions): labels=labels, logits=predictions) return unweighted_loss * weights, control_flow_ops.no_op() +# MUST USE WITH HESSIAN REGULARIZATION, +# This loss can have zero hessian, so it must be used with l2 or min_node_weight +# regularization. +# An example config is +# learner_config.constraints.min_node_weight = 1 / num_examples_per_layer +# learner_config.regularization.l2 = 1.0 / num_examples_per_layer +# TODO(nponomareva): make it multidimensional so we can estimate several +# quantiles at once. +def per_example_quantile_regression_loss(labels, weights, predictions, + quantile): + """Smoothed loss for quantile regression. + + The standard quantile regression loss is quantile*(y-y') when y>y' and + (quantile-1)*(y-y') otherwise, y' is a prediction, y is a label. The impl + below is this loss but squared in the region where the loss value < 1. 
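
[Editor's note] The docstring above defines the quantile loss piecewise and says the implementation squares it where the value is below 1. A small standalone NumPy sketch (not part of the patch) that mirrors the `array_ops.where` logic in the hunk below may make the smoothing concrete; the inputs are made-up numbers.

```python
import numpy as np

def smoothed_quantile_loss(labels, predictions, quantile):
  """Mirrors per_example_quantile_regression_loss, without weights."""
  error = labels - predictions
  # Branch taken when the label is above the prediction: squared while small.
  right = np.where(error * quantile < 1.0,
                   np.square(quantile * error),
                   quantile * error)
  # Branch taken otherwise; (quantile - 1) * error is non-negative here.
  left = np.where(error * (quantile - 1) < 1.0,
                  np.square((quantile - 1) * error),
                  (quantile - 1) * error)
  return np.where(error > 0, right, left)

print(smoothed_quantile_loss(np.array([1.0, 5.0]), np.array([2.0, 1.0]), 0.9))
# -> approximately [0.01, 3.6]
```

As the comment preceding the new function stresses, this loss can have a zero hessian, so it is meant to be paired with l2 or min_node_weight regularization.
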
+ + Args: + labels: Rank 2 (N, D) tensor of per-example labels. + weights: Rank 2 (N, 1) tensor of per-example weights. + predictions: Rank 2 (N, D) tensor of per-example predictions. + quantile: The quantile to use. + + Returns: + loss: A Rank 2 (N, 1) tensor of per-example quantile loss. + update_op: An update operation to update the loss's internal state. + """ + labels = math_ops.to_float(labels) + error = labels - predictions + square_loss_right = array_ops.where(error * quantile < 1.0, + math_ops.square(quantile * error), + quantile * error) + square_loss_left = array_ops.where(error * (quantile - 1) < 1, + math_ops.square((quantile - 1) * error), + (quantile - 1) * error) + + unweighted_loss = array_ops.where(error > 0, square_loss_right, + square_loss_left) + if weights is None: + return unweighted_loss, control_flow_ops.no_op() + else: + return unweighted_loss * weights, control_flow_ops.no_op() # This is classical form of Maximum entropy loss, that is twice differentiable # (sparse_softmax_cross_entropy which is what we go for is not twice @@ -78,8 +119,7 @@ def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15): labels = array_ops.expand_dims(labels, 1) # Labels are indices of classes, convert them to one hot encodings. target_one_hot = array_ops.one_hot(indices=labels, depth=num_classes) - labels = math_ops.reduce_sum( - input_tensor=target_one_hot, reduction_indices=[1]) + labels = math_ops.reduce_sum(input_tensor=target_one_hot, axis=[1]) labels = math_ops.to_float(labels) # Calculate softmax probabilities for each class. diff --git a/tensorflow/contrib/checkpoint/python/containers.py b/tensorflow/contrib/checkpoint/python/containers.py index 242c1e8ba45..5418e2605b7 100644 --- a/tensorflow/contrib/checkpoint/python/containers.py +++ b/tensorflow/contrib/checkpoint/python/containers.py @@ -46,6 +46,10 @@ class UniqueNameTracker(data_structures.CheckpointableDataStructure): self._maybe_initialize_checkpointable() self._name_counts = {} + @property + def _values(self): + return [dep.ref for dep in self._checkpoint_dependencies] + def track(self, checkpointable, base_name): """Add a dependency on `checkpointable`. 
diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD index 9e1867ea9d0..f944b7f8843 100644 --- a/tensorflow/contrib/cluster_resolver/BUILD +++ b/tensorflow/contrib/cluster_resolver/BUILD @@ -21,85 +21,18 @@ py_library( py_library( name = "cluster_resolver_py", - srcs = [ + srcs = glob([ "__init__.py", - "python/training/__init__.py", - ], + "python/training/*.py", + ]), srcs_version = "PY2AND3", visibility = ["//visibility:public"], - deps = [ - ":base_cluster_resolver_py", - ":gce_cluster_resolver_py", - ":kubernetes_cluster_resolver_py", - ":slurm_cluster_resolver_py", - ":tfconfig_cluster_resolver_py", - ":tpu_cluster_resolver_py", - "//tensorflow/python:util", - ], -) - -py_library( - name = "base_cluster_resolver_py", - srcs = ["python/training/cluster_resolver.py"], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/python:training", - ], -) - -py_library( - name = "gce_cluster_resolver_py", - srcs = ["python/training/gce_cluster_resolver.py"], - srcs_version = "PY2AND3", - deps = [ - ":base_cluster_resolver_py", - "//tensorflow/python:training", - ], -) - -py_library( - name = "tfconfig_cluster_resolver_py", - srcs = ["python/training/tfconfig_cluster_resolver.py"], - srcs_version = "PY2AND3", - deps = [ - ":base_cluster_resolver_py", - "//tensorflow/python:training", - ], -) - -py_library( - name = "tpu_cluster_resolver_py", - srcs = ["python/training/tpu_cluster_resolver.py"], - srcs_version = "PY2AND3", - deps = [ - ":base_cluster_resolver_py", - "//tensorflow/python:training", - ], -) - -py_library( - name = "slurm_cluster_resolver_py", - srcs = ["python/training/slurm_cluster_resolver.py"], - srcs_version = "PY2AND3", - deps = [ - ":base_cluster_resolver_py", - "//tensorflow/python:training", - ], -) - -py_library( - name = "kubernetes_cluster_resolver_py", - srcs = ["python/training/kubernetes_cluster_resolver.py"], - srcs_version = "PY2AND3", - deps = [ - ":base_cluster_resolver_py", - "//tensorflow/python:training", - ], + deps = ["//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib"], ) tf_py_test( - name = "base_cluster_resolver_py_test", - srcs = ["python/training/cluster_resolver_test.py"], + name = "cluster_resolver_initialization_test", + srcs = ["cluster_resolver_initialization_test.py"], additional_deps = [ ":cluster_resolver_py", "//tensorflow/python:client_testlib", @@ -108,86 +41,5 @@ tf_py_test( "//tensorflow/python:platform_test", "//tensorflow/python:training", ], - main = "python/training/cluster_resolver_test.py", -) - -tf_py_test( - name = "gce_cluster_resolver_py_test", - size = "small", - srcs = ["python/training/gce_cluster_resolver_test.py"], - additional_deps = [ - ":cluster_resolver_py", - ":gce_cluster_resolver_py", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", - ], - main = "python/training/gce_cluster_resolver_test.py", -) - -tf_py_test( - name = "tfconfig_cluster_resolver_py_test", - size = "small", - srcs = ["python/training/tfconfig_cluster_resolver_test.py"], - additional_deps = [ - ":tfconfig_cluster_resolver_py", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", - ], - grpc_enabled = True, - main = "python/training/tfconfig_cluster_resolver_test.py", 
-) - -tf_py_test( - name = "tpu_cluster_resolver_py_test", - size = "small", - srcs = ["python/training/tpu_cluster_resolver_test.py"], - additional_deps = [ - ":tpu_cluster_resolver_py", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", - ], - grpc_enabled = True, - main = "python/training/tpu_cluster_resolver_test.py", -) - -tf_py_test( - name = "slurm_cluster_resolver_py_test", - size = "small", - srcs = ["python/training/slurm_cluster_resolver_test.py"], - additional_deps = [ - ":cluster_resolver_py", - ":slurm_cluster_resolver_py", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", - ], - main = "python/training/slurm_cluster_resolver_test.py", - tags = [], -) - -tf_py_test( - name = "kubernetes_cluster_resolver_py_test", - size = "small", - srcs = ["python/training/kubernetes_cluster_resolver_test.py"], - additional_deps = [ - ":cluster_resolver_py", - ":kubernetes_cluster_resolver_py", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", - ], - main = "python/training/kubernetes_cluster_resolver_test.py", + main = "cluster_resolver_initialization_test.py", ) diff --git a/tensorflow/contrib/cluster_resolver/__init__.py b/tensorflow/contrib/cluster_resolver/__init__.py index fd1263fe81a..390b3e7550b 100644 --- a/tensorflow/contrib/cluster_resolver/__init__.py +++ b/tensorflow/contrib/cluster_resolver/__init__.py @@ -20,12 +20,14 @@ from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import,unused-import -from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import SimpleClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.slurm_cluster_resolver import SlurmClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver +from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver +from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver +from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver +from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver +from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver +from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver +from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver +from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver # pylint: enable=wildcard-import,unused-import from tensorflow.python.util.all_util import remove_undocumented @@ -35,6 +37,8 @@ _allowed_symbols = [ 
'SimpleClusterResolver', 'UnionClusterResolver', 'GceClusterResolver', + 'KubernetesClusterResolver', + 'TFConfigClusterResolver', 'TPUClusterResolver', 'SlurmClusterResolver', ] diff --git a/tensorflow/contrib/cluster_resolver/cluster_resolver_initialization_test.py b/tensorflow/contrib/cluster_resolver/cluster_resolver_initialization_test.py new file mode 100644 index 00000000000..01ff1478c69 --- /dev/null +++ b/tensorflow/contrib/cluster_resolver/cluster_resolver_initialization_test.py @@ -0,0 +1,53 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests to ensure ClusterResolvers are usable via the old contrib path.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.cluster_resolver import SimpleClusterResolver +from tensorflow.contrib.cluster_resolver.python.training import cluster_resolver +from tensorflow.contrib.cluster_resolver.python.training import UnionClusterResolver +from tensorflow.python.platform import test +from tensorflow.python.training import server_lib + + +class ClusterResolverInitializationTest(test.TestCase): + + def testCreateSimpleClusterResolverFromLib(self): + base_cluster_spec = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + cluster_resolver.SimpleClusterResolver(base_cluster_spec) + + def testCreateSimpleClusterResolver(self): + base_cluster_spec = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + SimpleClusterResolver(base_cluster_spec) + + def testCreateUnionClusterResolver(self): + base_cluster_spec = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + simple_cr = SimpleClusterResolver(base_cluster_spec) + UnionClusterResolver(simple_cr) + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/cluster_resolver/python/training/__init__.py b/tensorflow/contrib/cluster_resolver/python/training/__init__.py index 6d9120a3b96..10d93549ebb 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/__init__.py +++ b/tensorflow/contrib/cluster_resolver/python/training/__init__.py @@ -18,11 +18,36 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import SimpleClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver -from 
tensorflow.contrib.cluster_resolver.python.training.kubernetes_cluster_resolver import KubernetesClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.slurm_cluster_resolver import SlurmClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.tfconfig_cluster_resolver import TFConfigClusterResolver -from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver +# This file (and all files in this directory in general) is a backwards +# compatibility shim that exists to re-export ClusterResolvers such that +# existing OSS code will not be broken. + +from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver +from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver +from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver +from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver +from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver +from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver +from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver +from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver + +from tensorflow.python.util.all_util import remove_undocumented + +_allowed_symbols = [ + 'cluster_resolver', + 'gce_cluster_resolver', + 'kubernetes_cluster_resolver', + 'slurm_cluster_resolver', + 'tfconfig_cluster_resolver', + 'tpu_cluster_resolver', + 'ClusterResolver', + 'SimpleClusterResolver', + 'UnionClusterResolver', + 'GceClusterResolver', + 'KubernetesClusterResolver', + 'TFConfigClusterResolver', + 'TPUClusterResolver', + 'SlurmClusterResolver', +] + +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py index 40b1e667ee6..99840fb5166 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py @@ -1,4 +1,4 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,333 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Cluster Resolvers are used for dynamic cluster IP/hostname resolution.""" +"""Stub file for ClusterResolver to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import abc +# This file (and all files in this directory in general) is a backwards +# compatibility shim that exists to re-export ClusterResolvers such that +# existing OSS code will not be broken. 
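
[Editor's note] The shim pattern applied across these contrib files keeps the old import paths alive while the real implementations move under `tensorflow.python.distribute.cluster_resolver`; `remove_undocumented` then trims each stub module down to the re-exported names in `_allowed_symbols`. A short sketch of what the new `cluster_resolver_initialization_test.py` exercises, with made-up host addresses:

```python
# After this patch, the contrib path and the new distribute path expose the
# same class objects (mirrors cluster_resolver_initialization_test.py).
from tensorflow.contrib.cluster_resolver import SimpleClusterResolver as contrib_simple
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import (
    SimpleClusterResolver as core_simple)
from tensorflow.python.training import server_lib

assert contrib_simple is core_simple  # one implementation, two import paths

spec = server_lib.ClusterSpec({"worker": ["worker0:2222", "worker1:2222"]})
resolver = contrib_simple(spec)
```
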
-import six +# pylint: disable=unused-import +from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver +from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver +from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver +# pylint: enable=unused-import -from tensorflow.python.training.server_lib import ClusterSpec +from tensorflow.python.util.all_util import remove_undocumented +_allowed_symbols = [ + 'ClusterResolver', + 'SimpleClusterResolver', + 'UnionClusterResolver', +] -def _format_master_url(master, rpc_layer=None): - if rpc_layer: - return '%s://%s' % (rpc_layer, master) - else: - return master +remove_undocumented(__name__, _allowed_symbols) - -@six.add_metaclass(abc.ABCMeta) -class ClusterResolver(object): - """Abstract class for all implementations of ClusterResolvers. - - This defines the skeleton for all implementations of ClusterResolvers. - ClusterResolvers are a way for TensorFlow to communicate with various cluster - management systems (e.g. GCE, AWS, etc...). - - By letting TensorFlow communicate with these systems, we will be able to - automatically discover and resolve IP addresses for various TensorFlow - workers. This will eventually allow us to automatically recover from - underlying machine failures and scale TensorFlow worker clusters up and down. - """ - - @abc.abstractmethod - def cluster_spec(self): - """Retrieve the current state of the cluster and returns a ClusterSpec. - - Returns: - A ClusterSpec representing the state of the cluster at the moment this - function is called. - - Implementors of this function must take care in ensuring that the - ClusterSpec returned is up-to-date at the time of calling this function. - This usually means retrieving the information from the underlying cluster - management system every time this function is invoked and reconstructing - a cluster_spec, rather than attempting to cache anything. - """ - raise NotImplementedError( - 'cluster_spec is not implemented for {}.'.format(self)) - - @abc.abstractmethod - def master(self, task_type=None, task_index=None, rpc_layer=None): - """Retrieves the name or URL of the session master. - - Args: - task_type: (Optional) The type of the TensorFlow task of the master. - task_index: (Optional) The index of the TensorFlow task of the master. - rpc_layer: (Optional) The RPC protocol for the given cluster. - - Returns: - The name or URL of the session master. - - Implementors of this function must take care in ensuring that the master - returned is up-to-date at the time to calling this function. This usually - means retrieving the master every time this function is invoked. 
- """ - raise NotImplementedError('master is not implemented for {}.'.format(self)) - - -class SimpleClusterResolver(ClusterResolver): - """Simple implementation of ClusterResolver that accepts a ClusterSpec.""" - - def __init__(self, cluster_spec, master='', task_type=None, task_index=None, - environment='', num_accelerators_per_worker=0, - rpc_layer=None): - """Creates a SimpleClusterResolver from a ClusterSpec.""" - super(SimpleClusterResolver, self).__init__() - - self._task_type = task_type - self._task_index = task_index - self._environment = environment - self._num_accelerators_per_worker = num_accelerators_per_worker - self._rpc_layer = rpc_layer - - if not isinstance(cluster_spec, ClusterSpec): - raise TypeError('cluster_spec must be a ClusterSpec.') - self._cluster_spec = cluster_spec - - if not isinstance(master, str): - raise TypeError('master must be a string.') - self._master = master - - def cluster_spec(self): - """Returns the ClusterSpec passed into the constructor.""" - return self._cluster_spec - - def master(self, task_type=None, task_index=None, rpc_layer=None): - """Returns the master address to use when creating a session. - - Args: - task_type: (Optional) The type of the TensorFlow task of the master. - task_index: (Optional) The index of the TensorFlow task of the master. - rpc_layer: (Optional) The RPC used by distributed TensorFlow. - - Returns: - The name or URL of the session master. - - If a task_type and task_index is given, this will override the `master` - string passed into the initialization function. - """ - if task_type is not None and task_index is not None: - master = self.cluster_spec().task_address(task_type, task_index) - else: - master = self._master - - return _format_master_url(master, rpc_layer or self._rpc_layer) - - @property - def task_type(self): - return self._task_type - - @property - def task_index(self): - return self._task_index - - @task_type.setter - def task_type(self, task_type): - self._task_type = task_type - - @task_index.setter - def task_index(self, task_index): - self._task_index = task_index - - @property - def environment(self): - return self._environment - - def num_accelerators_per_worker(self, session_config=None): - """Returns the number of accelerator cores per worker. - - Args: - session_config: Unused. The SimpleClusterResolver does not do automatic - detection of accelerators, so a TensorFlow session will never be - created, and thus a `session_config` is never necessary here, and will - be ignored. - """ - del session_config - return self._num_accelerators_per_worker - - @property - def rpc_layer(self): - return self._rpc_layer - - @rpc_layer.setter - def rpc_layer(self, rpc_layer): - self._rpc_layer = rpc_layer - - -class UnionClusterResolver(ClusterResolver): - """Performs a union on underlying ClusterResolvers. - - This class performs a union given two or more existing ClusterResolvers. It - merges the underlying ClusterResolvers, and returns one unified ClusterSpec - when cluster_spec is called. The details of the merge function is - documented in the cluster_spec function. - - For additional Cluster Resolver properties such as task type, task index, - rpc layer, environment, etc..., we will return the value from the first - ClusterResolver in the union. - """ - - def __init__(self, *args, **kwargs): - """Initializes a UnionClusterResolver with other ClusterResolvers. - - Args: - *args: `ClusterResolver` objects to be unionized. 
- **kwargs: - rpc_layer - (Optional) Override value for the RPC layer used by - TensorFlow. - task_type - (Optional) Override value for the current task type. - task_index - (Optional) Override value for the current task index. - - Raises: - TypeError: If any argument is not a subclass of `ClusterResolvers`. - ValueError: If there are no arguments passed. - """ - super(UnionClusterResolver, self).__init__() - - self._rpc_layer = kwargs.pop('rpc_layer', None) - self._task_type = kwargs.pop('task_type', None) - self._task_index = kwargs.pop('task_index', None) - - if kwargs: - raise ValueError('Unexpected kwargs provided {!r}'.format(kwargs)) - - if not args: - raise ValueError('At least one ClusterResolver is required.') - - for cluster_resolver in args: - if not isinstance(cluster_resolver, ClusterResolver): - raise TypeError('All arguments must be a sub-class of ' - '`ClusterResolver.`') - self._cluster_resolvers = args - - def cluster_spec(self): - """Returns a union of all the ClusterSpecs from the ClusterResolvers. - - Returns: - A ClusterSpec containing host information merged from all the underlying - ClusterResolvers. - - Raises: - KeyError: If there are conflicting keys detected when merging two or - more dictionaries, this exception is raised. - - Note: If there are multiple ClusterResolvers exposing ClusterSpecs with the - same job name, we will merge the list/dict of workers. - - If *all* underlying ClusterSpecs expose the set of workers as lists, we will - concatenate the lists of workers, starting with the list of workers from - the first ClusterResolver passed into the constructor. - - If *any* of the ClusterSpecs expose the set of workers as a dict, we will - treat all the sets of workers as dicts (even if they are returned as lists) - and will only merge them into a dict if there is no conflicting keys. If - there is a conflicting key, we will raise a `KeyError`. - """ - - merged_cluster = {} - - # We figure out whether it is all lists for a particular job, or whether - # there are dicts inside. - for cluster_resolver in self._cluster_resolvers: - cluster_spec = cluster_resolver.cluster_spec() - cluster_dict = cluster_spec.as_dict() - - for job_name, tasks in cluster_dict.items(): - if job_name in merged_cluster: - # If we see a dict, then we write a dict out regardless. - if isinstance(tasks, dict): - merged_cluster[job_name] = {} - else: - # We take whichever type is present. - if isinstance(tasks, list): - merged_cluster[job_name] = [] - else: - merged_cluster[job_name] = {} - - # We then do the merge as appropriate in merged_cluster[job]. - for cluster_resolver in self._cluster_resolvers: - cluster_spec = cluster_resolver.cluster_spec() - cluster_dict = cluster_spec.as_dict() - - for job_name, tasks in cluster_dict.items(): - if isinstance(merged_cluster[job_name], list): - # We all have lists, we can just concatenate and be done. - merged_cluster[job_name].extend(tasks) - else: - if isinstance(tasks, list): - # We convert to a dictionary if the type is a list. - task_dict = dict(zip(range(0, len(tasks)), tasks)) - else: - # We can simply make a copy (for update) and be done. - task_dict = tasks.copy() - - # We detect if there are duplicates, and raise an error if so. 
- task_keys = set(task_dict) - merged_keys = set(merged_cluster[job_name].keys()) - intersected_keys = task_keys.intersection(merged_keys) - if intersected_keys: - raise KeyError('Duplicate keys detected when merging two ' - 'ClusterSpecs: %s' % repr(intersected_keys)) - - # We do the merge after all the processing. - merged_cluster[job_name].update(task_dict) - - return ClusterSpec(merged_cluster) - - def master(self, task_type=None, task_index=None, rpc_layer=None): - """Returns the master address to use when creating a session. - - This usually returns the master from the first ClusterResolver passed in, - but you can override this by specifying the task_type and task_index. - - Args: - task_type: (Optional) The type of the TensorFlow task of the master. - task_index: (Optional) The index of the TensorFlow task of the master. - rpc_layer: (Optional) The RPC protocol for the given cluster. - - Returns: - The name or URL of the session master. - """ - if task_type is not None and task_index is not None: - master = self.cluster_spec().task_address(task_type, task_index) - return _format_master_url(master, rpc_layer or self._rpc_layer) - - return self._cluster_resolvers[0].master(rpc_layer=rpc_layer) - - @property - def task_type(self): - return self._task_type or self._cluster_resolvers[0].task_type - - @property - def task_index(self): - return self._task_index or self._cluster_resolvers[0].task_index - - @task_type.setter - def task_type(self, task_type): - self._task_type = task_type - - @task_index.setter - def task_index(self, task_index): - self._task_index = task_index - - @property - def environment(self): - return self._cluster_resolvers[0].environment - - def num_accelerators_per_worker(self, session_config=None): - return self._cluster_resolvers[0].num_accelerators_per_worker( - session_config) - - @property - def rpc_layer(self): - return self._rpc_layer or self._cluster_resolvers[0].rpc_layer - - @rpc_layer.setter - def rpc_layer(self, rpc_layer): - self._rpc_layer = rpc_layer diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py index 195b68959b6..55e61155c68 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py @@ -1,4 +1,4 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,197 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of Cluster Resolvers for GCE Instance Groups.""" +"""Stub file for GceClusterResolver to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function +# This file (and all files in this directory in general) is a backwards +# compatibility shim that exists to re-export ClusterResolvers such that +# existing OSS code will not be broken. 
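
[Editor's note] The SimpleClusterResolver and UnionClusterResolver code deleted above (and re-homed under tensorflow.python.distribute.cluster_resolver) formats the master URL with the rpc layer and merges job lists across resolvers, raising KeyError only on conflicting task keys. An illustrative sketch based on that deleted implementation; the host names are made up.

```python
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import (
    SimpleClusterResolver, UnionClusterResolver)
from tensorflow.python.training.server_lib import ClusterSpec

workers = SimpleClusterResolver(
    ClusterSpec({"worker": ["worker0:2222", "worker1:2222"]}),
    master="worker0:2222", rpc_layer="grpc")
ps = SimpleClusterResolver(ClusterSpec({"ps": ["ps0:2222"]}))

print(workers.master())             # grpc://worker0:2222
print(workers.master("worker", 1))  # grpc://worker1:2222

union = UnionClusterResolver(workers, ps)
print(union.cluster_spec().as_dict())
# {'ps': ['ps0:2222'], 'worker': ['worker0:2222', 'worker1:2222']}
```
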
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver -from tensorflow.python.training.server_lib import ClusterSpec +# pylint: disable=unused-import +from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver +# pylint: enable=unused-import -_GOOGLE_API_CLIENT_INSTALLED = True -try: - from googleapiclient import discovery # pylint: disable=g-import-not-at-top - from oauth2client.client import GoogleCredentials # pylint: disable=g-import-not-at-top -except ImportError: - _GOOGLE_API_CLIENT_INSTALLED = False +from tensorflow.python.util.all_util import remove_undocumented +_allowed_symbols = [ + 'GceClusterResolver', +] -def _format_master_url(master, rpc_layer=None): - return '%s://%s' % (rpc_layer, master) if rpc_layer else master - - -class GceClusterResolver(ClusterResolver): - """Cluster Resolver for Google Compute Engine. - - This is an implementation of cluster resolvers for the Google Compute Engine - instance group platform. By specifying a project, zone, and instance group, - this will retrieve the IP address of all the instances within the instance - group and return a Cluster Resolver object suitable for use for distributed - TensorFlow. - """ - - def __init__(self, - project, - zone, - instance_group, - port, - task_type='worker', - task_index=0, - rpc_layer='grpc', - num_accelerators_per_worker=0, - credentials='default', - service=None): - """Creates a new GceClusterResolver object. - - This takes in a few parameters and creates a GceClusterResolver project. It - will then use these parameters to query the GCE API for the IP addresses of - each instance in the instance group. - - Args: - project: Name of the GCE project. - zone: Zone of the GCE instance group. - instance_group: Name of the GCE instance group. - port: Port of the listening TensorFlow server (default: 8470) - task_type: Name of the TensorFlow job this GCE instance group of VM - instances belong to. - task_index: The task index for this particular VM, within the GCE - instance group. In particular, every single instance should be assigned - a unique ordinal index within an instance group manually so that they - can be distinguished from each other. - rpc_layer: The RPC layer TensorFlow should use to communicate across - instances. - num_accelerators_per_worker: Number of accelerators (GPUs) present per - instance. - credentials: GCE Credentials. If nothing is specified, this defaults to - GoogleCredentials.get_application_default(). - service: The GCE API object returned by the googleapiclient.discovery - function. (Default: discovery.build('compute', 'v1')). If you specify a - custom service object, then the credentials parameter will be ignored. - - Raises: - ImportError: If the googleapiclient is not installed. 
- """ - self._project = project - self._zone = zone - self._instance_group = instance_group - self._task_type = task_type - self._task_index = task_index - self._rpc_layer = rpc_layer - self._port = port - self._credentials = credentials - - if credentials == 'default': - if _GOOGLE_API_CLIENT_INSTALLED: - self._credentials = GoogleCredentials.get_application_default() - - if service is None: - if not _GOOGLE_API_CLIENT_INSTALLED: - raise ImportError('googleapiclient must be installed before using the ' - 'GCE cluster resolver') - self._service = discovery.build( - 'compute', 'v1', - credentials=self._credentials) - else: - self._service = service - - def cluster_spec(self): - """Returns a ClusterSpec object based on the latest instance group info. - - This returns a ClusterSpec object for use based on information from the - specified instance group. We will retrieve the information from the GCE APIs - every time this method is called. - - Returns: - A ClusterSpec containing host information retrieved from GCE. - """ - request_body = {'instanceState': 'RUNNING'} - request = self._service.instanceGroups().listInstances( - project=self._project, - zone=self._zone, - instanceGroups=self._instance_group, - body=request_body, - orderBy='name') - - worker_list = [] - - while request is not None: - response = request.execute() - - items = response['items'] - for instance in items: - instance_name = instance['instance'].split('/')[-1] - - instance_request = self._service.instances().get( - project=self._project, - zone=self._zone, - instance=instance_name) - - if instance_request is not None: - instance_details = instance_request.execute() - ip_address = instance_details['networkInterfaces'][0]['networkIP'] - instance_url = '%s:%s' % (ip_address, self._port) - worker_list.append(instance_url) - - request = self._service.instanceGroups().listInstances_next( - previous_request=request, - previous_response=response) - - worker_list.sort() - return ClusterSpec({self._task_type: worker_list}) - - def master(self, task_type=None, task_index=None, rpc_layer=None): - task_type = task_type if task_type is not None else self._task_type - task_index = task_index if task_index is not None else self._task_index - - if task_type is not None and task_index is not None: - master = self.cluster_spec().task_address(task_type, task_index) - if rpc_layer or self._rpc_layer: - return '%s://%s' % (rpc_layer or self._rpc_layer, master) - else: - return master - - return '' - - @property - def task_type(self): - return self._task_type - - @property - def task_index(self): - return self._task_index - - @task_type.setter - def task_type(self, task_type): - raise RuntimeError( - 'You cannot reset the task_type of the GceClusterResolver after it has ' - 'been created.') - - @task_index.setter - def task_index(self, task_index): - self._task_index = task_index - - @property - def environment(self): - """Returns the current environment which TensorFlow is running in. - - For users in the GCE environment, the environment property is always an - empty string, and Google users will not use this ClusterResolver for running - on internal systems. - """ - return '' - - @property - def rpc_layer(self): - return self._rpc_layer - - @rpc_layer.setter - def rpc_layer(self, rpc_layer): - self._rpc_layer = rpc_layer - - def num_accelerators_per_worker(self, session_config=None): - del session_config # Unused, since this is set manually in __init__. 
- return self._num_accelerators_per_worker - +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py index ddae64839f0..a8eaf33629a 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py @@ -12,121 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of Cluster Resolvers for Kubernetes.""" +"""Stub file for KubernetesClusterResolver for backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver -from tensorflow.python.training import server_lib +# This file (and all files in this directory in general) is a backwards +# compatibility shim that exists to re-export ClusterResolvers such that +# existing OSS code will not be broken. -_KUBERNETES_API_CLIENT_INSTALLED = True -try: - from kubernetes import client as k8sclient # pylint: disable=g-import-not-at-top - from kubernetes import config as k8sconfig # pylint: disable=g-import-not-at-top -except ImportError: - _KUBERNETES_API_CLIENT_INSTALLED = False +# pylint: disable=unused-import +from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver +# pylint: enable=unused-import +from tensorflow.python.util.all_util import remove_undocumented -class KubernetesClusterResolver(ClusterResolver): - """Cluster Resolver for Kubernetes. +_allowed_symbols = [ + 'KubernetesClusterResolver', +] - This is an implementation of cluster resolvers for Kubernetes. When given the - the Kubernetes namespace and label selector for pods, we will retrieve the - pod IP addresses of all running pods matching the selector, and return a - ClusterSpec based on that information. - """ +remove_undocumented(__name__, _allowed_symbols) - def __init__(self, - job_to_label_mapping=None, - tf_server_port=8470, - override_client=None): - """Initializes a new KubernetesClusterResolver. - - This initializes a new Kubernetes Cluster Resolver. The Cluster Resolver - will attempt to talk to the Kubernetes master to retrieve all the instances - of pods matching a label selector. - - Args: - job_to_label_mapping: A mapping of TensorFlow jobs to label selectors. - This allows users to specify many TensorFlow jobs in one Cluster - Resolver, and each job can have pods belong with different label - selectors. For example, a sample mapping might be - ``` - {'worker': ['job-name=worker-cluster-a', 'job-name=worker-cluster-b'], - 'ps': ['job-name=ps-1', 'job-name=ps-2']} - ``` - tf_server_port: The port the TensorFlow server is listening on. - override_client: The Kubernetes client (usually automatically retrieved - using `from kubernetes import client as k8sclient`). If you pass this - in, you are responsible for setting Kubernetes credentials manually. - - Raises: - ImportError: If the Kubernetes Python client is not installed and no - `override_client` is passed in. 
- """ - if _KUBERNETES_API_CLIENT_INSTALLED: - k8sconfig.load_kube_config() - - if not job_to_label_mapping: - job_to_label_mapping = {'worker': ['job-name=tensorflow']} - - if not override_client and not _KUBERNETES_API_CLIENT_INSTALLED: - raise ImportError('The Kubernetes Python client must be installed before' - 'using the Kubernetes Cluster Resolver. To install the' - 'Kubernetes Python client, run `pip install ' - 'kubernetes` on your command line.') - - self._job_to_label_mapping = job_to_label_mapping - self._tf_server_port = tf_server_port - self._override_client = override_client - - def master(self): - # TODO(frankchn): Figure out a standard way to pass in the current task type - # and task id via Kubernetes. - pass - - def get_master(self): - return self.master() - - def get_job_name(self): - return self._job_name - - def cluster_spec(self): - """Returns a ClusterSpec object based on the latest info from Kubernetes. - - We retrieve the information from the Kubernetes master every time this - method is called. - - Returns: - A ClusterSpec containing host information returned from Kubernetes. - - Raises: - RuntimeError: If any of the pods returned by the master is not in the - `Running` phase. - """ - if not self._override_client: - k8sconfig.load_kube_config() - - client = self._override_client or k8sclient.CoreV1Api() - cluster_map = {} - - for tf_job in self._job_to_label_mapping: - all_pods = [] - for selector in self._job_to_label_mapping[tf_job]: - ret = client.list_pod_for_all_namespaces(label_selector=selector) - selected_pods = [] - - # Sort the list by the name to make sure it doesn't change call to call. - for pod in sorted(ret.items, key=lambda x: x.metadata.name): - if pod.status.phase == 'Running': - selected_pods.append( - '%s:%s' % (pod.status.host_ip, self._tf_server_port)) - else: - raise RuntimeError('Pod "%s" is not running; phase: "%s"' % - (pod.metadata.name, pod.status.phase)) - all_pods.extend(selected_pods) - cluster_map[tf_job] = all_pods - - return server_lib.ClusterSpec(cluster_map) diff --git a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py index dabe2fe1d39..fcd2a846eeb 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py @@ -12,185 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of Cluster Resolvers for Slurm workload manager.""" +"""Stub file for SlurmClusterResolver to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections -import os -import subprocess +# This file (and all files in this directory in general) is a backwards +# compatibility shim that exists to re-export ClusterResolvers such that +# existing OSS code will not be broken. 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver -from tensorflow.python.training.server_lib import ClusterSpec +# pylint: disable=unused-import +from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver +# pylint: enable=unused-import +from tensorflow.python.util.all_util import remove_undocumented -class SlurmClusterResolver(ClusterResolver): - """Cluster Resolver for system with Slurm workload manager. +_allowed_symbols = [ + 'SlurmClusterResolver', +] - This is an implementation of cluster resolvers for Slurm clusters. This allows - the specification of jobs and task counts, number of tasks per node, number of - GPUs on each node and number of GPUs for each task, It retrieves system - attributes by Slurm environment variables, resolves allocated computing node - names, construct a cluster and return a Cluster Resolver object which an be - use for distributed TensorFlow. - """ - - def _resolve_hostnames(self): - """Resolve host names of nodes allocated in current jobs. - - Returns: - A list of node names as strings. - """ - hostlist = (subprocess.check_output(['scontrol', 'show', 'hostname']). - decode('utf-8').strip().split('\n')) - return hostlist - - def __init__(self, - jobs, - port_base=8888, - gpus_per_node=1, - gpus_per_task=1, - tasks_per_node=None, - auto_set_gpu=True): - """Creates a new SlurmClusterResolver object. - - This takes in parameters and creates a SlurmClusterResolver object. It uses - those parameters to check which nodes will processes reside and resolves - their hostnames. With the number of the GPUs on each node and number of GPUs - for each task it offsets the port number for each processes and allocate - GPUs to tasks by setting environment variables. The resolver currently - supports homogeneous tasks and default Slurm process allocation. - - Args: - jobs: Dictionary with job names as key and number of tasks in the job as - value - port_base: The first port number to start with for processes on a node. - gpus_per_node: Number of GPUs available on each node. - gpus_per_task: Number of GPUs to be used for each task. - tasks_per_node: Number of tasks to run on each node, if not set defaults - to Slurm's output environment variable SLURM_NTASKS_PER_NODE. - auto_set_gpu: Set the visible CUDA devices automatically while resolving - the cluster by setting CUDA_VISIBLE_DEVICES environment variable. - Defaults to True. - - Returns: - A ClusterResolver object which can be used with distributed TensorFlow. - - Raises: - RuntimeError: If requested more GPUs per node then available or requested - more tasks then assigned tasks. 
- """ - - # check if launched by mpirun - if 'OMPI_COMM_WORLD_RANK' in os.environ: - self._rank = int(os.environ['OMPI_COMM_WORLD_RANK']) - num_tasks = int(os.environ['OMPI_COMM_WORLD_SIZE']) - else: - self._rank = int(os.environ['SLURM_PROCID']) - num_tasks = int(os.environ['SLURM_NTASKS']) - - self._jobs = collections.OrderedDict(sorted(jobs.items())) - self._port_base = port_base - - # user specification overrides SLURM specification - if tasks_per_node is not None: - self._tasks_per_node = tasks_per_node - elif tasks_per_node is None and 'SLURM_NTASKS_PER_NODE' in os.environ: - self._tasks_per_node = int(os.environ['SLURM_NTASKS_PER_NODE']) - else: - raise RuntimeError('Neither `tasks_per_node` or ' - 'SLURM_NTASKS_PER_NODE is set.') - - self._gpus_per_node = gpus_per_node - self._gpus_per_task = gpus_per_task - - self._auto_set_gpu = auto_set_gpu - self._job_name = None - self._task_index = None - - self._gpu_allocation = [] - self._cluster_allocation = {} - - if self._tasks_per_node * self._gpus_per_task > self._gpus_per_node: - raise RuntimeError('Requested more GPUs per node then available.') - - if sum(self._jobs.values()) != num_tasks: - raise RuntimeError('Requested more tasks then assigned tasks.') - - def cluster_spec(self): - """Returns a ClusterSpec object based on the latest instance group info. - - This returns a ClusterSpec object for use based on information from the - specified initialization parameters and Slurm environment variables. The - cluster specification is resolved each time this function is called. The - resolver extract hostnames of nodes by scontrol and pack tasks in that - order until a node a has number of tasks that is equal to specification. - GPUs on nodes are allocated to tasks by specification through setting - CUDA_VISIBLE_DEVICES environment variable. - - Returns: - A ClusterSpec containing host information retrieved from Slurm's - environment variables. - """ - hostlist = self._resolve_hostnames() - - task_list = [] - self._gpu_allocation = [] - self._cluster_allocation = {} - - for host in hostlist: - for port_offset, gpu_offset in zip( - range(self._tasks_per_node), - range(0, self._gpus_per_node, self._gpus_per_task)): - - host_addr = '%s:%d' % (host, self._port_base + port_offset) - task_list.append(host_addr) - gpu_id_list = [] - - for gpu_id in range(gpu_offset, gpu_offset + self._gpus_per_task): - gpu_id_list.append(str(gpu_id)) - - self._gpu_allocation.append(','.join(gpu_id_list)) - - cluster_rank_offset_start = 0 - cluster_rank_offset_end = 0 - - for job_name, num_tasks in self._jobs.items(): - cluster_rank_offset_end = cluster_rank_offset_start + num_tasks - - self._cluster_allocation[job_name] = \ - task_list[cluster_rank_offset_start:cluster_rank_offset_end] - - if self._rank >= cluster_rank_offset_start and \ - self._rank < cluster_rank_offset_end: - - self._job_name = job_name - self._task_index = self._rank - cluster_rank_offset_start - - cluster_rank_offset_start = cluster_rank_offset_end - - if self._auto_set_gpu is True: - os.environ['CUDA_VISIBLE_DEVICES'] = self._gpu_allocation[self._rank] - - return ClusterSpec(self._cluster_allocation) - - def get_task_info(self): - """Returns job name and task_index for the process which calls this. - - This returns the job name and task index for the process which calls this - function according to its rank and cluster specification. The job name and - task index are set after a cluster is constructed by cluster_spec otherwise - defaults to None. 
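
[Editor's note] A hypothetical usage sketch for the SlurmClusterResolver being relocated here; the job names and counts are made up, and the call is only meaningful inside a Slurm allocation (or under mpirun), since the constructor shown above reads SLURM_PROCID / SLURM_NTASKS or the OMPI_* equivalents from the environment.

```python
from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import (
    SlurmClusterResolver)

resolver = SlurmClusterResolver(
    jobs={"ps": 1, "worker": 3},  # must sum to the number of Slurm tasks
    port_base=8888,
    gpus_per_node=2,
    gpus_per_task=1,
    tasks_per_node=2)

cluster_spec = resolver.cluster_spec()  # also sets CUDA_VISIBLE_DEVICES
job_name, task_index = resolver.get_task_info()
```
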
- - Returns: - A string specifying job name the process belongs to and an integner - specifying the task index the process belongs to in that job. - """ - return self._job_name, self._task_index - - def master(self, task_type=None, task_index=None): - if task_type and task_index: - return self.cluster_spec().task_address(task_type, task_index) - return self._cluster_allocation[str(self._job_name)][self._task_index] +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py index 7bbd189d03d..9db7f47dcb4 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py @@ -12,81 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of Cluster Resolvers for TF_CONFIG Environment Variables.""" - +"""Stub file for TFConfigClusterResolver to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import json -import os +# This file (and all files in this directory in general) is a backwards +# compatibility shim that exists to re-export ClusterResolvers such that +# existing OSS code will not be broken. -from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver -from tensorflow.python.training.server_lib import ClusterSpec +# pylint: disable=unused-import +from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver +# pylint: enable=unused-import -_TF_CONFIG_ENV = 'TF_CONFIG' -_SESSION_MASTER_KEY = 'session_master' +from tensorflow.python.util.all_util import remove_undocumented +_allowed_symbols = [ + 'TFConfigClusterResolver', +] -class TFConfigClusterResolver(ClusterResolver): - """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar.""" +remove_undocumented(__name__, _allowed_symbols) - def _load_tf_config(self): - return json.loads(os.environ.get(_TF_CONFIG_ENV, '{}')) - - def cluster_spec(self): - """Returns a ClusterSpec based on the TF_CONFIG environment variable. - - Returns: - A ClusterSpec with information from the TF_CONFIG environment variable. - """ - tf_config = self._load_tf_config() - if 'cluster' not in tf_config: - return ClusterSpec({}) - return ClusterSpec(tf_config['cluster']) - - def master(self, task_type=None, task_index=0): - """Returns the master address to use when creating a TensorFlow session. - - Args: - task_type: (String, optional) Overrides and sets the task_type of the - master. - task_index: (Integer, optional) Overrides and sets the task id of the - master. - - Returns: - The address of the master. - - Raises: - RuntimeError: If the task_type or task_id is not specified and the - `TF_CONFIG` environment variable does not contain a task section. - """ - - # If `session_master` is set, just use that. - tf_config = self._load_tf_config() - if _SESSION_MASTER_KEY in tf_config: - return tf_config[_SESSION_MASTER_KEY] - - if 'rpc_layer' in tf_config: - rpclayer = '%s://' % tf_config['rpc_layer'] - else: - rpclayer = '' - - # Return an empty string if we are the only job in the ClusterSpec. 
- cluster_spec = self.cluster_spec() - if (not cluster_spec.jobs or - (len(cluster_spec.jobs) == 1 and - len(cluster_spec.job_tasks(cluster_spec.jobs[0])) == 1)): - return '' - - # We try to auto-detect the task type and id, but uses the user-supplied one - # where available - if not task_type: - if 'task' not in tf_config: - raise RuntimeError('You must either specify a `task_type`, or your ' - 'TF_CONFIG must contain a `task` section.') - task_type = tf_config['task']['type'] - task_index = tf_config['task']['index'] - - return rpclayer + cluster_spec.task_address(task_type, task_index) diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py index 1f6803a9ff9..3a1eaccd06e 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py @@ -1,4 +1,4 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,341 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of Cluster Resolvers for Cloud TPUs.""" +"""Stub file for TPUClusterResolver to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os +# This file (and all files in this directory in general) is a backwards +# compatibility shim that exists to re-export ClusterResolvers such that +# existing OSS code will not be broken. -from six.moves.urllib.request import Request -from six.moves.urllib.request import urlopen +# pylint: disable=unused-import +from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver +# pylint: enable=unused-import -from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver -from tensorflow.python.training import server_lib -from tensorflow.python.util import compat +from tensorflow.python.util.all_util import remove_undocumented -_GOOGLE_API_CLIENT_INSTALLED = True -try: - from googleapiclient import discovery # pylint: disable=g-import-not-at-top - from oauth2client.client import GoogleCredentials # pylint: disable=g-import-not-at-top -except ImportError: - _GOOGLE_API_CLIENT_INSTALLED = False +_allowed_symbols = [ + 'TPUClusterResolver', +] - -_GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS' -_ENDPOINTS_SEPARATOR = ',' -_DEFAULT_ENV_VARIABLE = 'TPU_NAME' -_DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL' - - -class TPUClusterResolver(ClusterResolver): - """Cluster Resolver for Google Cloud TPUs. - - This is an implementation of cluster resolvers for the Google Cloud TPU - service. As Cloud TPUs are in alpha, you will need to specify a API definition - file for this to consume, in addition to a list of Cloud TPUs in your Google - Cloud Platform project. - """ - - def _tpuService(self): - """Creates a new Cloud TPU API object. - - This works around an issue where the underlying HTTP connection sometimes - times out when the script has been running for too long. 
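For readers following the shim above: the TF_CONFIG handling removed here (and now re-exported from `tensorflow.python.distribute.cluster_resolver`) consumes a JSON environment variable with a `cluster` map plus optional `task`, `rpc_layer` and `session_master` entries. A minimal sketch of that layout, with hypothetical addresses; the expected results in the comments follow the implementation shown above.

```python
import json
import os

# Hypothetical two-worker cluster; this process is worker 1.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ['10.1.0.1:2222', '10.1.0.2:2222'],
        'ps': ['10.1.0.3:2222'],
    },
    'task': {'type': 'worker', 'index': 1},
    'rpc_layer': 'grpc',
})

# Import path taken from the re-export in this patch.
from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver

resolver = TFConfigClusterResolver()
print(resolver.cluster_spec().as_dict())
# Per the parsing logic shown above, master() prefixes the task's own address
# with the rpc layer, i.e. 'grpc://10.1.0.2:2222' for this configuration.
print(resolver.master())
```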
Other methods in - this object calls this method to get a new API object whenever they need - to communicate with the Cloud API. - - Returns: - A Google Cloud TPU API object. - """ - if self._service: - return self._service - - credentials = self._credentials - if credentials is None or credentials == 'default': - credentials = GoogleCredentials.get_application_default() - - if self._discovery_url: - return discovery.build( - 'tpu', 'v1alpha1', - credentials=credentials, - discoveryServiceUrl=self._discovery_url) - else: - return discovery.build( - 'tpu', 'v1alpha1', - credentials=credentials) - - def _requestComputeMetadata(self, path): - req = Request('http://metadata/computeMetadata/v1/%s' % path, - headers={'Metadata-Flavor': 'Google'}) - resp = urlopen(req) - return compat.as_bytes(resp.read()) - - def _shouldResolve(self): - if (self._tpu == compat.as_bytes('') or - self._tpu == compat.as_bytes('local') or - self._tpu.startswith(compat.as_bytes('/bns')) or - self._tpu.startswith(compat.as_bytes('localhost:')) or - self._tpu.startswith(compat.as_bytes('grpc://'))): - return False - return True - - @staticmethod - def _inGke(): - """When running in GKE, the environment variable will be set.""" - return _GKE_ENV_VARIABLE in os.environ - - @staticmethod - def _gkeEndpoints(): - return os.environ[_GKE_ENV_VARIABLE] - - @staticmethod - def _envVarFallback(): - if _DEFAULT_ENV_VARIABLE in os.environ: - return os.environ[_DEFAULT_ENV_VARIABLE] - return None - - @staticmethod - def _environmentDiscoveryUrl(): - return os.environ.get(_DISCOVERY_SERVICE_URL_ENV_VARIABLE) - - def __init__(self, - tpu=None, - zone=None, - project=None, - job_name='worker', - coordinator_name=None, - coordinator_address=None, - credentials='default', - service=None, - discovery_url=None): - """Creates a new TPUClusterResolver object. - - The ClusterResolver will then use the parameters to query the Cloud TPU APIs - for the IP addresses and ports of each Cloud TPU listed. - - Args: - tpu: Either a string, or a list of strings corresponding to the TPUs to - use. If the single string is the empty string, the string 'local', or a - string that begins with 'grpc://' or '/bns', then it is assumed to not - correspond with a Cloud TPU and will instead be passed as the session - master and no ClusterSpec propagation will be done. - zone: Zone where the TPUs are located. If omitted or empty, we will assume - that the zone of the TPU is the same as the zone of the GCE VM, which we - will try to discover from the GCE metadata service. - project: Name of the GCP project containing Cloud TPUs. If omitted or - empty, we will try to discover the project name of the GCE VM from the - GCE metadata service. - job_name: Name of the TensorFlow job the TPUs belong to. - coordinator_name: The name to use for the coordinator. Set to None if the - coordinator should not be included in the computed ClusterSpec. - coordinator_address: The address of the coordinator (typically an ip:port - pair). If set to None, a TF server will be started. If coordinator_name - is None, a TF server will not be started even if coordinator_address is - None. - credentials: GCE Credentials. If None, then we use default credentials - from the oauth2client - service: The GCE API object returned by the googleapiclient.discovery - function. If you specify a custom service object, then the credentials - parameter will be ignored. - discovery_url: A URL template that points to the location of - the discovery service. 
It should have two parameters {api} and - {apiVersion} that when filled in produce an absolute URL to the - discovery document for that service. The environment variable - 'TPU_API_DISCOVERY_URL' will override this. - - Raises: - ImportError: If the googleapiclient is not installed. - ValueError: If no TPUs are specified. - """ - if isinstance(tpu, list): - if not tpu: - raise ValueError('At least one TPU must be specified.') - if len(tpu) != 1: - raise NotImplementedError( - 'Using multiple TPUs in a single session is not yet implemented') - tpu = tpu[0] - - in_gke = self._inGke() - # When using GKE with Cloud TPUs, the env variable will be set. - if tpu is None: - if in_gke: - tpu = self._gkeEndpoints() - else: - tpu = self._envVarFallback() - - if tpu is None: - raise ValueError('Please provide a TPU Name to connect to.') - - self._tpu = compat.as_bytes(tpu) # self._tpu is always bytes - self._job_name = job_name - - # Whether we should actually attempt to contact Cloud APIs - should_resolve = self._shouldResolve() - - # We error out if we are in a non-Cloud environment which cannot talk to the - # Cloud APIs using the standard class and a special object is not passed in. - self._service = service - if (self._service is None and should_resolve and - not _GOOGLE_API_CLIENT_INSTALLED): - raise ImportError('googleapiclient and oauth2client must be installed ' - 'before using the TPU cluster resolver. Execute: ' - '`pip install --upgrade google-api-python-client` ' - 'and `pip install --upgrade oauth2client` to ' - 'install with pip.') - - # We save user-passed credentials, unless the user didn't pass in anything. - self._credentials = credentials - if (credentials == 'default' and should_resolve and - _GOOGLE_API_CLIENT_INSTALLED): - self._credentials = None - - # Automatically detect project and zone if unspecified. - if not project and should_resolve: - project = compat.as_str( - self._requestComputeMetadata('project/project-id')) - if not zone and should_resolve: - zone_path = compat.as_str(self._requestComputeMetadata('instance/zone')) - zone = zone_path.split('/')[-1] - self._project = project - self._zone = zone - - self._discovery_url = self._environmentDiscoveryUrl() or discovery_url - - self._coordinator_name = coordinator_name - if (coordinator_name and not coordinator_address and - (should_resolve or in_gke)): - self._start_local_server() - else: - self._coordinator_address = coordinator_address - - def master(self, task_type=None, task_index=None): - """Get the Master string to be used for the session. - - In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of - first instance in the ClusterSpec returned by the cluster_spec function. - - If a non-TPU name is used when constructing a TPUClusterResolver, that will - be returned instead (e.g. If the tpus argument's value when constructing - this TPUClusterResolver was 'grpc://10.240.1.2:8470', - 'grpc://10.240.1.2:8470' will be returned). - - Args: - task_type: (Optional) The type of the TensorFlow task of the master. - task_index: (Optional) The index of the TensorFlow task of the master. - - Returns: - string, the connection string to use when creating a session. - - Raises: - ValueError: If none of the TPUs specified exists. 
- """ - if not self._shouldResolve(): - return self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0] - - cluster_spec = self.cluster_spec() - if task_type and task_index: - return cluster_spec.task_address(task_type, task_index) - - job_tasks = cluster_spec.job_tasks(self._job_name) - if not job_tasks: - raise ValueError('No TPUs exists with the specified names exist.') - - return 'grpc://' + job_tasks[0] - - def get_master(self): - return self.master() - - def get_job_name(self): - if self._shouldResolve(): - return self._job_name - - def cluster_spec(self): - """Returns a ClusterSpec object based on the latest TPU information. - - We retrieve the information from the GCE APIs every time this method is - called. - - Returns: - A ClusterSpec containing host information returned from Cloud TPUs. - - Raises: - RuntimeError: If the provided TPU is not healthy. - """ - ############################################################################ - # There are 5 potential cases this code must handle: - # 1. [Normal case.] We should resolve the TPU name to a set of tasks, and - # a. Create a ClusterSpec that includes the coordinator job - # b. Create a ClusterSpec without the coordinator job. - # 2. [GKE / No API Access.] We should not resolve the TPU name to a set of - # tasks and - # a. Create a ClusterSpec with the coordinator - # b. Create a ClusterSpec without the coordinator - # 3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec. - ############################################################################ - - if self._shouldResolve(): - # Case 1. - full_name = 'projects/%s/locations/%s/nodes/%s' % ( - self._project, self._zone, compat.as_text(self._tpu)) - service = self._tpuService() - request = service.projects().locations().nodes().get(name=full_name) - response = request.execute() - - if 'state' in response and response['state'] != 'READY': - raise RuntimeError('TPU "%s" is not yet ready; state: "%s"' % - (compat.as_text(self._tpu), response['state'])) - - if 'health' in response and response['health'] != 'HEALTHY': - raise RuntimeError('TPU "%s" is unhealthy: "%s"' % - (compat.as_text(self._tpu), response['health'])) - - if 'networkEndpoints' in response: - worker_list = [ - '%s:%s' % (endpoint['ipAddress'], endpoint['port']) - for endpoint in response['networkEndpoints'] - ] - else: - # Fall back to the deprecated response format - instance_url = '%s:%s' % (response['ipAddress'], response['port']) - worker_list = [instance_url] - - cluster_spec = {self._job_name: worker_list} - else: - if not self._tpu.startswith(compat.as_bytes('grpc://')): - # Case 3. - return None - # Case 2. 
- cluster_spec = { - self._job_name: [ - x[len(compat.as_bytes('grpc://')):] - for x in self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR)) - ] - } - - if self._coordinator_address: - # {1, 2}.a - cluster_spec[self._coordinator_name] = [self._coordinator_address] - - return server_lib.ClusterSpec(cluster_spec) - - def _start_local_server(self): - address = self._requestComputeMetadata('instance/network-interfaces/0/ip') - self._server = server_lib.Server( - { - 'local': ['0.0.0.0:0'] - }, protocol='grpc', config=None, start=True) - # self._server.target is of the form: grpc://ipaddress:port - target = compat.as_bytes(self._server.target) - splits = target.split(compat.as_bytes(':')) - assert len(splits) == 3, self._server.target - assert splits[0] == compat.as_bytes('grpc'), self._server.target - self._coordinator_port = compat.as_text(splits[2]) - self._coordinator_address = '%s:%s' % ( - address, compat.as_text(self._coordinator_port)) - - def __deepcopy__(self, memo): - # TODO(b/73668574): Remove this once RunConfig avoids performing deepcopy. - return self +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index a63366e1361..124d6cfd478 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -12,7 +12,7 @@ if(WIN32) endif() # Project -project(tensorflow C CXX) +project(tensorflow VERSION 1.12.0 LANGUAGES C CXX) # Set C++14 as standard for the whole project set(CMAKE_CXX_STANDARD 14) @@ -193,6 +193,7 @@ if(WIN32) set(CMAKE_SUPPRESS_REGENERATION ON) endif() + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -std=c++11") endif() @@ -281,6 +282,14 @@ else (systemlib_ZLIB) ${zlib_STATIC_LIBRARIES}) endif (systemlib_ZLIB) +if (systemlib_ABSEIL_CPP) + set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES} + ${abseil_cpp_LIBRARIES}) +else (systemlib_ABSEIL_CPP) + set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES} + ${abseil_cpp_STATIC_LIBRARIES}) +endif (systemlib_ABSEIL_CPP) + set(tensorflow_EXTERNAL_DEPENDENCIES zlib_copy_headers_to_destination gif_copy_headers_to_destination @@ -394,6 +403,7 @@ if (tensorflow_ENABLE_GPU) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true) # Flush denormals to zero set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include) + include_directories(${CUDA_INCLUDE}) if (WIN32) add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.7,5.2,6.0,6.1,7.0) @@ -546,14 +556,20 @@ if (tensorflow_ENABLE_GPU) cudnn_version_number=${tensorflow_CUDNN_VERSION}) endif(WIN32) else(tensorflow_ENABLE_GPU) - set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value - msvcp_dll_name=msvcp140.dll) + if(WIN32) + set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value + msvcp_dll_name=msvcp140.dll) + else() + set(tensorflow_BUILD_INFO_FLAGS --build_config cpu) + endif() endif(tensorflow_ENABLE_GPU) -# Find python executable -include(FindPythonInterp) -if(NOT ${PYTHONINTERP_FOUND}) - message(FATAL_ERROR "CMake was unable to find a python interpreter.") +if(tensorflow_BUILD_PYTHON_BINDINGS) + # Find python executable + include(FindPythonInterp) + if(NOT ${PYTHONINTERP_FOUND}) + message(FATAL_ERROR "CMake was unable to find a python interpreter.") + endif() endif() # Let's get to work! 
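Returning briefly to the TPU resolver rewritten earlier in this patch: per its docstring, a `tpu` argument that already looks like a gRPC endpoint bypasses the Cloud TPU API entirely, while a bare TPU name is resolved through the API (which requires `googleapiclient`, `oauth2client` and valid credentials). A hedged usage sketch; the TPU name, zone and project below are placeholders.

```python
# Import path taken from the re-export in this patch.
from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver

# Case A: a concrete gRPC endpoint. No Cloud API call is made and master()
# simply echoes the endpoint.
direct = TPUClusterResolver(tpu='grpc://10.240.1.2:8470')
print(direct.master())  # grpc://10.240.1.2:8470

# Case B: a TPU name (hypothetical). Resolution goes through the Cloud TPU
# API; master() then returns 'grpc://<ip>:<port>' of the first resolved
# worker, and cluster_spec() lists all workers under the job name.
named = TPUClusterResolver(tpu='my-tpu', zone='us-central1-b',
                           project='my-gcp-project')
print(named.cluster_spec().as_dict())  # e.g. {'worker': ['<ip>:<port>', ...]}
print(named.master())
```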
@@ -574,6 +590,7 @@ include(tf_cc_ops.cmake) include(tf_c.cmake) include(tf_grappler.cmake) include(tf_core_profiler.cmake) +include(tf_core_eager_runtime.cmake) if(tensorflow_BUILD_CC_EXAMPLE) include(tf_tutorials.cmake) include(tf_label_image_example.cmake) @@ -587,4 +604,4 @@ if(tensorflow_BUILD_SHARED_LIB) endif() if(tensorflow_BUILD_CC_TESTS OR tensorflow_BUILD_PYTHON_TESTS) include(tf_tests.cmake) -endif() +endif() \ No newline at end of file diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md index 84c679162c3..df5ff6cd532 100644 --- a/tensorflow/contrib/cmake/README.md +++ b/tensorflow/contrib/cmake/README.md @@ -6,9 +6,9 @@ platforms. For details, see the [TensorFlow install guide](https://www.tensorflow.org/install/). This directory contains CMake files for building TensorFlow on Microsoft -Windows. [CMake](https://cmake.org) is a cross-platform tool that can +Windows and Linux. [CMake](https://cmake.org) is a cross-platform tool that can generate build scripts for multiple build systems, including Microsoft -Visual Studio. +Visual Studio and GCC. "The method has not been tested on Mac OS X. **N.B.** We provide Linux build instructions primarily for the purpose of testing the build. We recommend using the standard Bazel-based build on @@ -23,6 +23,7 @@ for instructions on how to install a pre-built TensorFlow package on Windows. ### Current known limitations * It is not possible to load a custom Op library. * GCS file system is not supported. +* Debug build is not available since Python for Windows is no longer distributed with a debug library. ## Building with CMake @@ -53,12 +54,12 @@ bindings. ### Known-good configurations * Microsoft Windows 10 - - Microsoft Visual Studio Enterprise 2015 with Visual C++ 2015 + - Microsoft Visual Studio Enterprise/ Community 2015 with Visual C++ 2015 - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.anaconda.com/download/) - [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win) - [swigwin-3.0.10](http://www.swig.org/download.html) - - [NVidia CUDA Toolkit 8.0](https://developer.nvidia.com/cuda-downloads) - - [NVidia CUDNN 5.1](https://developer.nvidia.com/cudnn) + - [NVidia CUDA Toolkit 9.0](https://developer.nvidia.com/cuda-downloads) + - [NVidia CUDNN 7](https://developer.nvidia.com/cudnn) - [CMake 3.6](https://cmake.org/files/v3.6/cmake-3.6.3-win64-x64.msi) * Ubuntu 14.04 @@ -66,8 +67,8 @@ bindings. - Docker 1.9.1 (for automated testing) ### Current known limitations - - The Python package supports **Python 3.5 only**, because that is the only - version for which standard Python binaries exist and those binaries are + - The Python package supports **Python 3.5/3.6 only**, because these are the only + versions for which standard Python binaries exist and those binaries are compatible with the TensorFlow runtime. (On Windows, the standard Python binaries for versions earlier than 3.5 were compiled with older compilers that do not have all of the features (e.g. C++11 support) needed to compile @@ -104,8 +105,151 @@ We are actively working on improving CMake and Windows support, and addressing these limitations. We would appreciate pull requests that implement missing ops or APIs. +CMake GUI build (all platforms) +================================== +Install from CMake GUI would be a convenient way to generate C++ build projects. The software supports Windows, MacOS and Linux, while the posix platform provides an extra ccmake binary to run command line GUI. 
Both cmake, ccmake and cmake-gui work on the same principle; they differ only in the interface they provide for project configuration and dependency settings.

-Step-by-step Windows build
+0. Pre-build checklist:
+   The following binaries/libraries should be available on the system path; otherwise you need to set their locations manually via cmake.
+   * Compiler (GCC for Linux, MSVC for Windows)
+     * Make sure the compiler directory has been added to the system path
+   * CUDA 9.0 (GPU build)
+   * CUDNN (GPU build)
+   * NCCL (GPU build on Linux)
+   * SWIG (python binding)
+   * Perl (required if you need ssl support, optional)
+   * Go (required if you need ssl support, optional)
+   * NASM/YASM (required by grpc for ssl support, optional)
+1. Start CMake GUI.
+2. Click on `Browse Source` and point it to the folder `/tensorflow/contrib/cmake`.
+3. Click on `Browse Build` and specify a location where you want tensorflow to be built.
+4. Click on `Configure`; a new window will pop up asking for the generator to use for project generation. For Windows, choose `Visual Studio Win64`; for Linux, choose `Unix Makefiles`; then press `Finish`. Wait a moment while the default project dependencies are generated automatically.
+5. There are a few options with which you can customize your build. **The settings here are crucial for a successful build, please check all items carefully.**
+   * `tensorflow_BUILD_ALL_KERNELS` should always be `on`
+   * `tensorflow_BUILD_CC_EXAMPLE` defaults to `on`. This can help you to test the build (optional)
+   * `tensorflow_BUILD_CONTRIB_KERNELS` defaults to `on`, but it won't affect tensorflow functionality; turn it to `off` if you want a slim build. (optional)
+   * `tensorflow_BUILD_PYTHON_BINDING` defaults to `on`. Set to `off` if you don't need the python interface. If SWIG is not on the system path, you need to set it manually. (optional)
+   * `tensorflow_BUILD_SHARED_LIB` defaults to `off`. Set to `on` if you want the c++ interface. (optional)
+   * `tensorflow_ENABLE_GPU` defaults to `off`. Set to `on` if you want GPU support. It will search for the CUDA and CUDNN dependencies if you have added them to the system path; otherwise CMake will report an error and ask you to set them manually. (optional)
+   * `tensorflow_ENABLE_GRPC_SUPPORT` defaults to `on`. For a Linux build this option must always be `on`, and it also needs to be `on` for a gpu build. Remember that Perl, Go and NASM/YASM are required for this option if you want to build grpc with official SSL support.
+   * `tensorflow_ENABLE_POSITION_INDEPENDENT_CODE` should always be `on`
+   * `tensorflow_ENABLE_SNAPPY_SUPPORT` should always be `on`
+   * `tensorflow_OPTIMIZE_FOR_NATIVE_ARCH` should always be `on`
+   * `CMAKE_INSTALL_PREFIX` is the location where the final package will be installed. You may change it to your own preferred path (optional)
+
+6. After changing the configuration in step 5, press `Configure` again.
+7. If no error is found, press `Generate`.
+
+#### Windows
+
+1. Open `tensorflow.sln` in the build folder (Windows). Change the build type from `Debug` to `Release`. Choose `Build`->`Build Solution`. Compilation may take several hours. If everything is alright, the output window will show no errors.
+
+   ##### Python
+
+   In the solution explorer, right click on `tf_python_build_pip_package` -> `build`. It will generate the wheel file in `/tf_python/dist`. Install it with the following command:
+
+   ```pip install --upgrade tensorflow-.whl```
+
+   ***The wheel name varies depending on your config. Change it to your own wheel filename.***
+
+   Note that some pip installations require a command prompt with administrator rights.
+
+   ##### C++
+
+   You can use the build folder tree directly for the C++ interface with cmake. If you want to do an installation for an api release, right click on `Install` -> `build`. The headers and library will be installed in the directory specified by `CMAKE_INSTALL_PREFIX` during configuration.
+
+2. On computers with less RAM, an out-of-heap-space error may appear during the build. Switching to a command prompt build is an alternative way to carry out step 1.
+
+   Open `VS2015 x64 Native Tools Command Prompt`. You can open it by pressing `Start` and then typing the binary name. Use `VS2017 x64 Native Tools Command Prompt` if you are using MSVC 2017.
+
+   ##### Python
+
+   Build the python wheel package directly with the following command:
+
+   ```MSBuild /p:Configuration=Release ```
+
+   Remember to change `` to the actual path of the file; it can be found at the root of the build directory.
+
+   Install the wheel file generated as instructed by step 1.
+
+   ##### C++ interface
+   Build from the VS native toolchain with the following command:
+   ```MSBuild /p:Configuration=Release ```
+
+   Headers are located throughout the build folders. The Tensorflow library can be found at `/Release`, namely `tensorflow.dll` and `tensorflow.lib`.
+
+   * Build to install for an api release (optional):
+   ```MSBuild /p:Configuration=Release ```
+
+   Remember to change `` and `` to the actual paths of the files; they can be found at the root of the build directory.
+
+#### Linux/MacOS (command line GNU build)
+
+1. Open the terminal and change the working directory to the one specified in step 3.
+
+2. Type the following command:
+
+   ```make -sj all```
+
+   ##### Python
+
+   **Important Note** The CMake-generated python wheel for Linux/MacOS is currently under development. Please use the bazel build.
+
+   The following is the expected Linux/MacOS python package build flow once that development work is completed.
+
+   ```
+   make -sj tf_python_build_pip_package
+   cd tf_python
+   pip install --upgrade tensorflow-.whl
+   ```
+
+   ##### C++ interface
+
+   ```make -sj install```
+
+   Where `` is the number of threads used for the compilation; change it to any integer less than or equal to your computer's maximum thread count.
+
+   Headers are located throughout the build folders. The Tensorflow library can be found at ``, namely `tensorflow.so` (Linux) or `tensorflow.dylib` (MacOS).
+
+#### Start a Tensorflow C++ project with CMake
+Here we assume that you have basic knowledge of gathering dependencies with `CMakeLists.txt`. Below we show how the C++ api works with the [official hello world tutorial](https://www.tensorflow.org/api_guides/cc/guide).
+
+1. Create a new working directory and create a new text file named `CMakeLists.txt` and the c++ file `main.cxx`.
+2. Fill in `main.cxx` with the code provided in the [official c++ api basic](https://www.tensorflow.org/api_guides/cc/guide).
+3. 
Fill in the `CMakeLists.txt` with following code: + ``` cmake + cmake_minimum_required (VERSION 2.6) + project (tf_hello) + + # Tensorflow + find_package(Tensorflow REQUIRED) + include_directories(${TENSORFLOW_INCLUDE_DIRS}) + + # compiler setting required by tensorflow, to be tested on all compilers + # currently only tested on MSVC and GCC + if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) + add_definitions(-DCOMPILER_MSVC) + elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU) + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS "3") + add_definitions(-DCOMPILER_GCC3) + else() + add_definitions(-D__GNUC__) + endif() + else() + message(ERROR " compiler ${CMAKE_CXX_COMPILER_ID} not supported by this CMakeList.txt, under development") + endif() + + add_executable(tf_hello main.cxx) + target_link_libraries(tf_hello ${TENSORFLOW_LIBRARIES}) + ``` +4. Configure the folder with cmake-gui, an error should be prompted out, requesting you to locate the folder containing `TensorflowConfig.cmake`. This file can be found at `` or `` (for those have build install in previous steps). + +5. Configure again, generate the project. +6. Compile the project with `Release` config (Windows). For Linux users, just compile the project. +7. Copy the `tensorflow.dll`(Windows)/`tensorflow.so`(Linux) from build directory to the build folder containing `tf_hello` binary. +8. Run `tf_hello` binary + +Step-by-step Windows build (command prompt) ========================== 1. Install the prerequisites detailed above, and set up your environment. @@ -292,4 +436,4 @@ $ cd tensorflow $ tensorflow/tools/ci_build/ci_build.sh CMAKE tensorflow/tools/ci_build/builds/cmake.sh ``` -That's it. Dependencies included. +That's it. Dependencies included. \ No newline at end of file diff --git a/tensorflow/contrib/cmake/TensorflowConfig.cmake.in b/tensorflow/contrib/cmake/TensorflowConfig.cmake.in new file mode 100644 index 00000000000..cc04db6e952 --- /dev/null +++ b/tensorflow/contrib/cmake/TensorflowConfig.cmake.in @@ -0,0 +1,16 @@ +# - Config file for the Tensorflow package +# It defines the following variables +# TENSORFLOW_INCLUDE_DIRS - include directories for FooBar +# TENSORFLOW_LIBRARIES - libraries to link against + +# Compute paths +get_filename_component(TENSORFLOW_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +set(TENSORFLOW_INCLUDE_DIRS "@CONF_INCLUDE_DIRS@") + +# Our library dependencies (contains definitions for IMPORTED targets) +if(NOT TENSORFLOW_BINARY_DIR) + include("${TENSORFLOW_CMAKE_DIR}/TensorflowTargets.cmake") +endif() + +# These are IMPORTED targets created by TensorflowTargets.cmake +set(TENSORFLOW_LIBRARIES tensorflow) \ No newline at end of file diff --git a/tensorflow/contrib/cmake/TensorflowConfigVersion.cmake.in b/tensorflow/contrib/cmake/TensorflowConfigVersion.cmake.in new file mode 100644 index 00000000000..2a9609ddb9c --- /dev/null +++ b/tensorflow/contrib/cmake/TensorflowConfigVersion.cmake.in @@ -0,0 +1,11 @@ +set(PACKAGE_VERSION "@TENSORFLOW_VERSION@") + +# Check whether the requested PACKAGE_FIND_VERSION is compatible +if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_COMPATIBLE FALSE) +else() + set(PACKAGE_VERSION_COMPATIBLE TRUE) + if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_EXACT TRUE) + endif() +endif() \ No newline at end of file diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake index 4546dbdecc0..46a193971c5 100644 --- 
a/tensorflow/contrib/cmake/external/abseil_cpp.cmake +++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake @@ -31,27 +31,24 @@ if (systemlib_ABSEIL_CPP) message(STATUS " abseil_cpp includes: ${ABSEIL_CPP_INCLUDE_DIR}") message(STATUS " abseil_cpp libraries: ${ABSEIL_CPP_LIBRARIES}") - add_custom_target(abseil_cpp_build) - list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build) + add_custom_target(abseil_cpp) + list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp) else (systemlib_ABSEIL_CPP) include (ExternalProject) - set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build) + set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp) set(abseil_cpp_URL https://github.com/abseil/abseil-cpp/archive/e01d95528ea2137a4a27a88d1f57c6cb260aafed.tar.gz) set(abseil_cpp_HASH SHA256=84043ed402d2a2a6ba4cdddb7e85118b1158fd81fe4ac3a14adc343d054c1e2e) - set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build) + set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp-build) if(WIN32) if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*") set(abseil_cpp_STATIC_LIBRARIES ${abseil_cpp_BUILD}/absl/base/Release/absl_base.lib - ${abseil_cpp_BUILD}/absl/base/Release/absl_spinlock_wait.lib ${abseil_cpp_BUILD}/absl/base/Release/absl_dynamic_annotations.lib - ${abseil_cpp_BUILD}/absl/base/Release/absl_malloc_internal.lib - ${abseil_cpp_BUILD}/absl/base/Release/absl_throw_delegate.lib - ${abseil_cpp_BUILD}/absl/numeric/Release/absl_int128.lib + ${abseil_cpp_BUILD}/absl/base/Release/absl_internal_malloc_internal.lib ${abseil_cpp_BUILD}/absl/strings/Release/absl_strings.lib ${abseil_cpp_BUILD}/absl/strings/Release/str_format_internal.lib ${abseil_cpp_BUILD}/absl/types/Release/absl_bad_optional_access.lib) @@ -80,15 +77,12 @@ else (systemlib_ABSEIL_CPP) ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a) endif() - ExternalProject_Add(abseil_cpp_build + ExternalProject_Add(abseil_cpp PREFIX abseil_cpp URL ${abseil_cpp_URL} URL_HASH ${abseil_cpp_HASH} DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" - BUILD_IN_SOURCE 1 BUILD_BYPRODUCTS ${abseil_cpp_STATIC_LIBRARIES} - BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release - COMMAND ${CMAKE_COMMAND} --build . 
--config Release INSTALL_COMMAND "" CMAKE_CACHE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE} @@ -99,6 +93,6 @@ else (systemlib_ABSEIL_CPP) include_directories(${abseil_cpp_INCLUDE_DIR}) list(APPEND tensorflow_EXTERNAL_LIBRARIES ${abseil_cpp_STATIC_LIBRARIES}) - list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build) + list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp) -endif (systemlib_ABSEIL_CPP) +endif (systemlib_ABSEIL_CPP) \ No newline at end of file diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake index 1a147e9c8e5..32e6d78e508 100644 --- a/tensorflow/contrib/cmake/external/png.cmake +++ b/tensorflow/contrib/cmake/external/png.cmake @@ -59,6 +59,7 @@ ExternalProject_Add(png -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_INSTALL_PREFIX:STRING=${png_INSTALL} -DZLIB_ROOT:STRING=${ZLIB_INSTALL} + -DPNG_TESTS:BOOL=OFF ) ## put png includes in the directory where they are expected diff --git a/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake b/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake index d4f8bb1bec9..944ae3997a9 100644 --- a/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake +++ b/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake @@ -24,10 +24,10 @@ if(EXISTS "${ABSEIL_CPP_INCLUDE_DIR}" AND NOT "${ABSEIL_CPP_INCLUDE_DIR}" STREQU # search all libraries if no COMPONENTS was requested set(AbseilCpp_FIND_COMPONENTS "absl_algorithm;absl_any;absl_bad_any_cast" - "absl_bad_optional_access;absl_base absl_container;absl_debugging" + "absl_bad_optional_access;absl_base;absl_container;absl_debugging" "absl_dynamic_annotations;absl_examine_stack;absl_failure_signal_handler" - "absl_int128;absl_leak_check;absl_malloc_internal;absl_memory;absl_meta" - "absl_numeric;absl_optional;absl_span;absl_spinlock_wait;absl_stack_consumption" + "absl_int128;absl_leak_check;absl_internal_malloc_internal;absl_memory;absl_meta" + "absl_numeric;absl_optional;absl_span;absl_internal_spinlock_wait;absl_stack_consumption" "absl_stacktrace;absl_str_format;absl_strings;absl_symbolize;absl_synchronization" "absl_throw_delegate;absl_time;absl_utility;str_format_extension_internal" "str_format_internal;test_instance_tracker_lib") diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake index 7a30eb94f54..a04142bd249 100644 --- a/tensorflow/contrib/cmake/tf_c.cmake +++ b/tensorflow/contrib/cmake/tf_c.cmake @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== + ######################################################## # tf_c_framework library ######################################################## diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake index a54cbff33b6..d8884d464fb 100644 --- a/tensorflow/contrib/cmake/tf_core_cpu.cmake +++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake @@ -39,6 +39,8 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs "${tensorflow_source_dir}/tensorflow/core/*test*.h" "${tensorflow_source_dir}/tensorflow/core/*test*.cc" "${tensorflow_source_dir}/tensorflow/core/*main.cc" + "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.cc" + "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.h" "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/*.cc" "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc" "${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session.cc" diff --git a/tensorflow/contrib/cmake/tf_core_eager_runtime.cmake b/tensorflow/contrib/cmake/tf_core_eager_runtime.cmake new file mode 100644 index 00000000000..78e4c0d3035 --- /dev/null +++ b/tensorflow/contrib/cmake/tf_core_eager_runtime.cmake @@ -0,0 +1,57 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +######################################################## +# tf_core_eager_runtime library +######################################################## +file(GLOB_RECURSE tf_core_eager_runtime_srcs + "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.cc" + "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.h" +) + +file(GLOB_RECURSE tf_core_eager_runtime_exclude_srcs + "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*test*.h" + "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*test*.cc" +) + +list(REMOVE_ITEM tf_core_eager_runtime_srcs ${tf_core_eager_runtime_exclude_srcs}) + +add_library(tf_core_eager_runtime OBJECT ${tf_core_eager_runtime_srcs}) +add_dependencies( + tf_core_eager_runtime + tf_c + tf_core_lib) + + +file(GLOB_RECURSE tf_c_eager_srcs + "${tensorflow_source_dir}/tensorflow/c/eager/*.cc" + "${tensorflow_source_dir}/tensorflow/c/eager/*.h" +) + +file(GLOB_RECURSE tf_c_eager_exlclude_srcs + "${tensorflow_source_dir}/tensorflow/c/eager/*test*.h" + "${tensorflow_source_dir}/tensorflow/c/eager/*test*.cc" +) + +list(REMOVE_ITEM tf_c_eager_srcs ${tf_c_eager_exlclude_srcs}) + +add_library(tf_c_eager OBJECT ${tf_c_eager_srcs}) +add_dependencies( + tf_c_eager + tf_core_eager_runtime + tf_c + tf_cc_framework + tf_cc_while_loop + tf_core_lib + tf_protos_cc) \ No newline at end of file diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake index 7e806685b84..d7b2a1339e0 100644 --- a/tensorflow/contrib/cmake/tf_core_framework.cmake +++ b/tensorflow/contrib/cmake/tf_core_framework.cmake @@ -140,16 +140,19 @@ set(tf_proto_text_srcs "tensorflow/core/example/example.proto" "tensorflow/core/example/feature.proto" "tensorflow/core/framework/allocation_description.proto" + "tensorflow/core/framework/api_def.proto" "tensorflow/core/framework/attr_value.proto" "tensorflow/core/framework/cost_graph.proto" "tensorflow/core/framework/device_attributes.proto" "tensorflow/core/framework/function.proto" "tensorflow/core/framework/graph.proto" "tensorflow/core/framework/graph_transfer_info.proto" + "tensorflow/core/framework/iterator.proto" "tensorflow/core/framework/kernel_def.proto" "tensorflow/core/framework/log_memory.proto" "tensorflow/core/framework/node_def.proto" "tensorflow/core/framework/op_def.proto" + "tensorflow/core/framework/reader_base.proto" "tensorflow/core/framework/remote_fused_graph_execute_info.proto" "tensorflow/core/framework/resource_handle.proto" "tensorflow/core/framework/step_stats.proto" @@ -159,6 +162,7 @@ set(tf_proto_text_srcs "tensorflow/core/framework/tensor_shape.proto" "tensorflow/core/framework/tensor_slice.proto" "tensorflow/core/framework/types.proto" + "tensorflow/core/framework/variable.proto" "tensorflow/core/framework/versions.proto" "tensorflow/core/lib/core/error_codes.proto" "tensorflow/core/protobuf/cluster.proto" @@ -204,10 +208,10 @@ file(GLOB tf_core_platform_srcs "${tensorflow_source_dir}/tensorflow/core/framework/resource_handle.h" "${tensorflow_source_dir}/tensorflow/core/framework/resource_handle.cc") if (NOT tensorflow_ENABLE_GPU) - file(GLOB tf_core_platform_gpu_srcs + file(GLOB tf_core_platform_gpu_srcs_exclude "${tensorflow_source_dir}/tensorflow/core/platform/cuda_libdevice_path.*" "${tensorflow_source_dir}/tensorflow/core/platform/default/cuda_libdevice_path.*") - list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_gpu_srcs}) + list(REMOVE_ITEM 
tf_core_platform_srcs ${tf_core_platform_gpu_srcs_exclude}) else() file(GLOB tf_core_platform_srcs_exclude "${tensorflow_source_dir}/tensorflow/core/platform/default/device_tracer.cc") diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake index 9cfa8b90749..6e75963313a 100644 --- a/tensorflow/contrib/cmake/tf_core_ops.cmake +++ b/tensorflow/contrib/cmake/tf_core_ops.cmake @@ -13,13 +13,14 @@ # limitations under the License. # ============================================================================== set(tf_op_lib_names - "audio_ops" "array_ops" + "audio_ops" "batch_ops" "bitwise_ops" "boosted_trees_ops" "candidate_sampling_ops" "checkpoint_ops" + "collective_ops" "control_flow_ops" "ctc_ops" "cudnn_rnn_ops" @@ -32,8 +33,8 @@ set(tf_op_lib_names "io_ops" "linalg_ops" "list_ops" - "lookup_ops" "logging_ops" + "lookup_ops" "manip_ops" "math_ops" "nn_ops" @@ -43,10 +44,11 @@ set(tf_op_lib_names "remote_fused_graph_ops" "resource_variable_ops" "rpc_ops" + "scoped_allocator_ops" "script_ops" "sdca_ops" - "set_ops" "sendrecv_ops" + "set_ops" "sparse_ops" "spectral_ops" "state_ops" @@ -54,6 +56,7 @@ set(tf_op_lib_names "string_ops" "summary_ops" "training_ops" + "word2vec_ops" ) foreach(tf_op_lib_name ${tf_op_lib_names}) diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index df7b854afcc..50284985982 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -313,15 +313,14 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name) ${GENERATE_PYTHON_OP_LIB_DESTINATION} PARENT_SCOPE) endfunction() -GENERATE_PYTHON_OP_LIB("audio_ops") GENERATE_PYTHON_OP_LIB("array_ops") +GENERATE_PYTHON_OP_LIB("audio_ops") GENERATE_PYTHON_OP_LIB("batch_ops") GENERATE_PYTHON_OP_LIB("bitwise_ops") GENERATE_PYTHON_OP_LIB("boosted_trees_ops") -GENERATE_PYTHON_OP_LIB("math_ops") -GENERATE_PYTHON_OP_LIB("functional_ops") GENERATE_PYTHON_OP_LIB("candidate_sampling_ops") GENERATE_PYTHON_OP_LIB("checkpoint_ops") +GENERATE_PYTHON_OP_LIB("collective_ops") GENERATE_PYTHON_OP_LIB("control_flow_ops" ADDITIONAL_LIBRARIES $) GENERATE_PYTHON_OP_LIB("ctc_ops") @@ -332,14 +331,18 @@ GENERATE_PYTHON_OP_LIB("decode_proto_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_decode_proto_op.py) GENERATE_PYTHON_OP_LIB("encode_proto_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_encode_proto_op.py) +GENERATE_PYTHON_OP_LIB("function_ops") +GENERATE_PYTHON_OP_LIB("functional_ops") GENERATE_PYTHON_OP_LIB("image_ops") GENERATE_PYTHON_OP_LIB("io_ops") GENERATE_PYTHON_OP_LIB("linalg_ops") GENERATE_PYTHON_OP_LIB("list_ops") GENERATE_PYTHON_OP_LIB("logging_ops") GENERATE_PYTHON_OP_LIB("lookup_ops") -GENERATE_PYTHON_OP_LIB("nn_ops") GENERATE_PYTHON_OP_LIB("manip_ops") +GENERATE_PYTHON_OP_LIB("math_ops") +GENERATE_PYTHON_OP_LIB("nn_ops") +GENERATE_PYTHON_OP_LIB("no_op") GENERATE_PYTHON_OP_LIB("parsing_ops") GENERATE_PYTHON_OP_LIB("random_ops") GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops" @@ -347,17 +350,21 @@ GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops" GENERATE_PYTHON_OP_LIB("resource_variable_ops") GENERATE_PYTHON_OP_LIB("rpc_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rpc/python/ops/gen_rpc_op.py) +GENERATE_PYTHON_OP_LIB("scoped_allocator_ops") GENERATE_PYTHON_OP_LIB("script_ops") GENERATE_PYTHON_OP_LIB("sdca_ops") +GENERATE_PYTHON_OP_LIB("sendrecv_ops") 
GENERATE_PYTHON_OP_LIB("set_ops") -GENERATE_PYTHON_OP_LIB("state_ops") GENERATE_PYTHON_OP_LIB("sparse_ops") GENERATE_PYTHON_OP_LIB("spectral_ops") +GENERATE_PYTHON_OP_LIB("state_ops") +GENERATE_PYTHON_OP_LIB("stateless_random_ops") GENERATE_PYTHON_OP_LIB("string_ops") GENERATE_PYTHON_OP_LIB("summary_ops") GENERATE_PYTHON_OP_LIB("user_ops") GENERATE_PYTHON_OP_LIB("training_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/training/gen_training_ops.py) +GENERATE_PYTHON_OP_LIB("word2vec_ops") GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_model_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_model_ops.py) @@ -391,11 +398,8 @@ GENERATE_PYTHON_OP_LIB("contrib_layers_sparse_feature_cross_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/layers/ops/gen_sparse_feature_cross_op.py) GENERATE_PYTHON_OP_LIB("contrib_memory_stats_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/memory_stats/ops/gen_memory_stats_ops.py) -GENERATE_PYTHON_OP_LIB("contrib_nccl_ops" - DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nccl/ops/gen_nccl_ops.py) GENERATE_PYTHON_OP_LIB("contrib_periodic_resample_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/periodic_resample/python/ops/gen_periodic_resample_op.py) - GENERATE_PYTHON_OP_LIB("contrib_nearest_neighbor_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nearest_neighbor/ops/gen_nearest_neighbor_ops.py) GENERATE_PYTHON_OP_LIB("contrib_resampler_ops" @@ -524,11 +528,13 @@ if(WIN32) add_library(pywrap_tensorflow_internal_static STATIC ${pywrap_tensorflow_internal_src} $ + $ $ $ $ $ $ + $ $ $ $ @@ -581,11 +587,13 @@ endif(WIN32) add_library(pywrap_tensorflow_internal SHARED ${pywrap_tensorflow_internal_src} $ + $ $ $ $ $ $ + $ $ $ $ @@ -615,13 +623,28 @@ target_include_directories(pywrap_tensorflow_internal PUBLIC ${NUMPY_INCLUDE_DIR} ) -target_link_libraries(pywrap_tensorflow_internal PRIVATE +if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0) + # There is a bug in GCC 5 resulting in undefined reference to a __cpu_model function when + # linking to the tensorflow library. Adding the following libraries fixes it. + # See issue on github: https://github.com/tensorflow/tensorflow/issues/9593 + target_link_libraries(pywrap_tensorflow_internal PRIVATE + ${tf_core_gpu_kernels_lib} + ${tensorflow_EXTERNAL_LIBRARIES} + tf_protos_cc + tf_python_protos_cc + ${PYTHON_LIBRARIES} + gcc_s + gcc +) +else() + target_link_libraries(pywrap_tensorflow_internal PRIVATE ${tf_core_gpu_kernels_lib} ${tensorflow_EXTERNAL_LIBRARIES} tf_protos_cc tf_python_protos_cc ${PYTHON_LIBRARIES} ) +endif() if(WIN32) diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake index fdf522f1fd9..62005dd113b 100644 --- a/tensorflow/contrib/cmake/tf_shared_lib.cmake +++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake @@ -23,6 +23,8 @@ if(WIN32) # we need. # add_library(tensorflow_static STATIC + $ + $ $ $ $ @@ -65,6 +67,8 @@ endif(WIN32) # tensorflow is a shared library containing all of the # TensorFlow runtime and the standard ops and kernels. 
add_library(tensorflow SHARED + $ + $ $ $ $ @@ -96,6 +100,27 @@ if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0) target_link_libraries(tensorflow PRIVATE gcc_s gcc) endif() +# Offer the user the choice of overriding the installation directories +set(INSTALL_LIB_DIR lib CACHE PATH "Installation directory for libraries") +set(INSTALL_BIN_DIR bin CACHE PATH "Installation directory for executables") +set(INSTALL_INCLUDE_DIR include CACHE PATH + "Installation directory for header files") +if(WIN32 AND NOT CYGWIN) + set(DEF_INSTALL_CMAKE_DIR cmake) +else() + set(DEF_INSTALL_CMAKE_DIR lib/cmake) +endif() +set(INSTALL_CMAKE_DIR ${DEF_INSTALL_CMAKE_DIR} CACHE PATH + "Installation directory for CMake files") + +# Make relative paths absolute (needed later on) +foreach(p LIB BIN INCLUDE CMAKE) + set(var INSTALL_${p}_DIR) + if(NOT IS_ABSOLUTE "${${var}}") + set(${var} "${CMAKE_INSTALL_PREFIX}/${${var}}") + endif() +endforeach() + if(WIN32) add_dependencies(tensorflow tensorflow_static) endif(WIN32) @@ -103,14 +128,57 @@ endif(WIN32) target_include_directories(tensorflow PUBLIC $) -install(TARGETS tensorflow EXPORT tensorflow_export - RUNTIME DESTINATION bin - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) +# Add all targets to build-tree export set +export(TARGETS tensorflow + FILE ${PROJECT_BINARY_DIR}/TensorflowTargets.cmake) + +# Export the package for use from the build-tree +export(PACKAGE Tensorflow) + +# Create the TensorflowConfig.cmake and TensorflowConfigVersion files +file(RELATIVE_PATH REL_INCLUDE_DIR "${INSTALL_CMAKE_DIR}" + "${INSTALL_INCLUDE_DIR}") +# for the build tree +set(CONF_INCLUDE_DIRS "${tensorflow_source_dir}" + "${PROJECT_BINARY_DIR}" + "${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src" + "${CMAKE_CURRENT_BINARY_DIR}/nsync/install/include" # Please if there is a better directory + "${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/Eigen/" + "${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/" + "${tensorflow_source_dir}/third_party/eigen3/" + "${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/") +configure_file(TensorflowConfig.cmake.in + "${PROJECT_BINARY_DIR}/TensorflowConfig.cmake" @ONLY) +# for the install tree, yet to be complete +set(CONF_INCLUDE_DIRS "\${TENSORFLOW_CMAKE_DIR}/${REL_INCLUDE_DIR}") +configure_file(TensorflowConfig.cmake.in + "${PROJECT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/TensorflowConfig.cmake" @ONLY) +# for both +configure_file(TensorflowConfigVersion.cmake.in + "${PROJECT_BINARY_DIR}/TensorflowConfigVersion.cmake" @ONLY) + +# install(TARGETS tensorflow EXPORT tensorflow_export +# RUNTIME DESTINATION ${INSTALL_BIN_DIR} +# LIBRARY DESTINATION ${INSTALL_LIB_DIR} +# ARCHIVE DESTINATION ${INSTALL_LIB_DIR}) + +# install(EXPORT tensorflow_export +# FILE TensorflowConfig.cmake +# DESTINATION ${INSTALL_CMAKE_DIR}) -install(EXPORT tensorflow_export - FILE TensorflowConfig.cmake - DESTINATION lib/cmake) +install(FILES + "${PROJECT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/TensorflowConfig.cmake" + "${PROJECT_BINARY_DIR}/TensorflowConfigVersion.cmake" + DESTINATION "${INSTALL_CMAKE_DIR}" COMPONENT dev) + +# install the export set for use with the install-tree +install(EXPORT TensorflowTargets + DESTINATION ${INSTALL_CMAKE_DIR}) + +install(TARGETS tensorflow EXPORT TensorflowTargets + RUNTIME DESTINATION ${INSTALL_BIN_DIR} + LIBRARY DESTINATION ${INSTALL_LIB_DIR} + ARCHIVE DESTINATION ${INSTALL_LIB_DIR}) # install necessary headers # tensorflow headers @@ -145,6 +213,10 @@ install(DIRECTORY 
${tensorflow_source_dir}/third_party/eigen3/ # unsupported Eigen directory install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/ DESTINATION include/unsupported/Eigen) +# absl directory +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/abseil_cpp/src/abseil_cpp/absl/ + DESTINATION include/absl + FILES_MATCHING PATTERN "*.h") # mkl if (tensorflow_ENABLE_MKL_SUPPORT) install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include/ diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD index 1630f010ab6..e4566437c60 100644 --- a/tensorflow/contrib/compiler/BUILD +++ b/tensorflow/contrib/compiler/BUILD @@ -58,6 +58,7 @@ py_library( srcs_version = "PY2AND3", deps = [ "//tensorflow/compiler/jit:xla_ops_py", + "//tensorflow/compiler/jit/ops:xla_ops_grad", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_ops", diff --git a/tensorflow/contrib/compiler/xla.py b/tensorflow/contrib/compiler/xla.py index 335ac794648..f867cd15b67 100644 --- a/tensorflow/contrib/compiler/xla.py +++ b/tensorflow/contrib/compiler/xla.py @@ -23,6 +23,7 @@ import contextlib from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.compiler.jit.ops import xla_ops +from tensorflow.compiler.jit.ops import xla_ops_grad # pylint: disable=unused-import from tensorflow.core.framework import attr_value_pb2 from tensorflow.python.estimator import model_fn as model_fn_lib from tensorflow.python.framework import ops diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py index 41258edd908..6926c0d03fe 100644 --- a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py +++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py @@ -74,8 +74,8 @@ class ConstrainedMinimizationProblem(object): if (constraints_shape.ndims is None or proxy_constraints_shape.ndims is None or - any([ii is None for ii in constraints_shape.as_list()]) or - any([ii is None for ii in proxy_constraints_shape.as_list()])): + any(ii is None for ii in constraints_shape.as_list()) or + any(ii is None for ii in proxy_constraints_shape.as_list())): raise ValueError( "constraints and proxy_constraints must have fully-known shapes") if constraints_shape != proxy_constraints_shape: diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py index 656633f0bf2..40e159b8fcb 100644 --- a/tensorflow/contrib/crf/python/ops/crf.py +++ b/tensorflow/contrib/crf/python/ops/crf.py @@ -38,12 +38,12 @@ tf_unary_scores, tf_sequence_lengths, tf_transition_params, _ = session.run( [unary_scores, sequence_lengths, transition_params, train_op]) for tf_unary_scores_, tf_sequence_length_ in zip(tf_unary_scores, tf_sequence_lengths): -# Remove padding. -tf_unary_scores_ = tf_unary_scores_[:tf_sequence_length_] + # Remove padding. + tf_unary_scores_ = tf_unary_scores_[:tf_sequence_length_] -# Compute the highest score and its tag sequence. -tf_viterbi_sequence, tf_viterbi_score = tf.contrib.crf.viterbi_decode( - tf_unary_scores_, tf_transition_params) + # Compute the highest score and its tag sequence. 
+ tf_viterbi_sequence, tf_viterbi_score = tf.contrib.crf.viterbi_decode( + tf_unary_scores_, tf_transition_params) """ from __future__ import absolute_import diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD index 670b5494327..8d35622e393 100644 --- a/tensorflow/contrib/cudnn_rnn/BUILD +++ b/tensorflow/contrib/cudnn_rnn/BUILD @@ -42,10 +42,11 @@ tf_custom_op_py_library( cuda_py_test( name = "cudnn_rnn_ops_test", - size = "large", + size = "medium", srcs = ["python/kernel_tests/cudnn_rnn_ops_test.py"], additional_deps = [ ":cudnn_rnn_py", + "@absl_py//absl/testing:parameterized", "//tensorflow/core:protos_all_py", "//tensorflow/contrib/rnn:rnn_py", "//tensorflow/python/ops/losses:losses", @@ -61,7 +62,7 @@ cuda_py_test( "//tensorflow/python:training", "//tensorflow/python:variables", ], - shard_count = 6, + shard_count = 2, tags = [ "noasan", # http://b/62067814 "requires-gpu-sm35", diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py index ae839108ebe..a268415f0e6 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py @@ -18,24 +18,30 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import itertools import os import unittest +from absl.testing import parameterized import numpy as np from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops from tensorflow.core.protobuf import saver_pb2 +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed from tensorflow.python.framework.test_util import TensorFlowTestCase from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gradient_checker -from tensorflow.python.ops import math_ops +from tensorflow.python.ops import gradients_impl +from tensorflow.python.ops import init_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import rnn +from tensorflow.python.ops import rnn_cell_impl from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import googletest from tensorflow.python.platform import test @@ -56,714 +62,989 @@ CUDNN_RNN_TANH_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_RNN_TANH_PARAMS_PER_LAYER CUDNN_RNN_RELU_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_RNN_RELU_PARAMS_PER_LAYER -def _CreateModel(rnn_mode, - num_layers, - num_units, - input_size, - input_mode="linear_input", - direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION, - dtype=dtypes.float32, - dropout=0.): - del input_mode - if rnn_mode == cudnn_rnn_ops.CUDNN_LSTM: - model_fn = cudnn_rnn_ops.CudnnLSTM - elif rnn_mode == cudnn_rnn_ops.CUDNN_GRU: - model_fn = cudnn_rnn_ops.CudnnGRU - elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_TANH: - model_fn = cudnn_rnn_ops.CudnnRNNTanh - elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_RELU: - model_fn = cudnn_rnn_ops.CudnnRNNRelu +def RunLSTM(sess, + num_units, + input_size, + batch_size, + time, + num_layers=1, + is_training=True, + dropout=0., + num_dirs=True, + dtype=dtypes.float32): + # TODO(jamesqin): add multi-layer tests. 
+ # TODO(jamesqin): add multi-dir tests + assert num_layers == 1 + assert num_dirs == 1 + if is_training and not np.isclose(dropout, 0): + raise ValueError("dropout can not be 0. when test training.") + + # set graph level random seed and numpy random seed. + random_seed.set_random_seed(0) + np.random.seed(0) + + inputs = variable_scope.get_variable( + "inputs", + initializer=np.random.rand(time, batch_size, + input_size).astype(dtype.as_numpy_dtype), + dtype=dtype) + initial_h_op = variable_scope.get_variable( + "initial_h_op", + initializer=np.random.rand(batch_size, + num_units).astype(dtype.as_numpy_dtype), + dtype=dtype) + initial_c_op = variable_scope.get_variable( + "initial_c_op", + initializer=np.random.rand(batch_size, + num_units).astype(dtype.as_numpy_dtype), + dtype=dtype) + + initializer = init_ops.random_uniform_initializer( + -0.01, 0.01, dtype=dtype, seed=19980904) + + with variable_scope.variable_scope("test", initializer=initializer): + w = variable_scope.get_variable( + "rnn/lstm_cell/kernel", + shape=[input_size + num_units, num_units * 4], + dtype=dtype) + b = variable_scope.get_variable( + "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype) + + # canonical lstm. must set forget_bias to 0. to align with cudnn lstm. + cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True) + outputs_op, state_tuple_op = rnn.dynamic_rnn( + cell, + inputs, + initial_state=rnn_cell_impl.LSTMStateTuple( + h=initial_h_op, c=initial_c_op), + dtype=dtype, + time_major=True, + scope=None) + + # Convert to cudnn opaque param. + format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM( + num_layers, num_units, input_size) + opaque_params = format_converter.tf_canonical_to_opaque([w, b]) + + cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0) + cu_initial_c_op = array_ops.expand_dims(initial_c_op, axis=0) + cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn( + inputs, + cu_initial_h_op, + cu_initial_c_op, + opaque_params, + dropout=dropout, + is_training=is_training, + rnn_mode=cudnn_rnn_ops.CUDNN_LSTM) + # Remove the trivial 1st dimension. + cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple( + c=array_ops.squeeze(cu_c_op, axis=0), + h=array_ops.squeeze(cu_h_op, axis=0)) + + if is_training: + (inp_grad_op, hgrad_op, + cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients( + outputs_op, [inputs, initial_h_op, initial_c_op, w, b]) + + (cu_inp_grad_op, cu_hgrad_op, + cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients( + cu_outputs_op, + [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params]) + # Remove the trivial 1st dimension + cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0) + # Remove the trivial 1st dimension + cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0) + + cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical( + opaque_grad_op) + cu_wgrad_op = cu_wgrad_op[0] + cu_bgrad_op = cu_bgrad_op[0] + # cudnn lstm has 2 biases each gate. When converting to tf canonical format, + # the two biases are summed into one. Thus here bias gradient should be + # halved when comparing with tf lstm. 
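# --- Illustrative sketch (not part of the patch; plain NumPy, hypothetical
# names): why the converted cuDNN bias gradient comes out doubled. cuDNN keeps
# two additive bias vectors per gate (call them b_ih and b_hh) whose sum plays
# the role of the single LSTMCell bias, so each receives the same gradient as
# that single bias, and summing the pair in opaque_to_tf_canonical doubles it,
# hence the 0.5 factor applied right below.
import numpy as np

rng = np.random.RandomState(0)
s = rng.rand(4)                          # stand-in for W.x + R.h
b_ih, b_hh = rng.rand(4), rng.rand(4)    # cuDNN-style bias pair
b_tf = b_ih + b_hh                       # tf-canonical single bias

def loss(z):
  return np.sum(np.tanh(z) ** 2)         # any smooth scalar loss

eps, e0 = 1e-6, np.eye(4)[0]
g_tf = (loss(s + b_tf + eps * e0) - loss(s + b_tf)) / eps
g_ih = (loss(s + b_ih + eps * e0 + b_hh) - loss(s + b_ih + b_hh)) / eps
g_hh = (loss(s + b_ih + b_hh + eps * e0) - loss(s + b_ih + b_hh)) / eps
assert np.allclose([g_ih, g_hh], g_tf, rtol=1e-4)    # each equals the tf grad
assert np.isclose(g_ih + g_hh, 2 * g_tf, rtol=1e-4)  # their sum is ~2x it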
+ cu_bgrad_op *= 0.5 + + init_op = variables.global_variables_initializer() + sess.run(init_op) + + if is_training: + outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([ + outputs_op, state_tuple_op, inp_grad_op, + (hgrad_op, cgrad_op), wgrad_op, bgrad_op + ]) + (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad, + cu_bgrad) = sess.run([ + cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op, + (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op + ]) + + logging.vlog(1, "outputs: %s" % outputs) + logging.vlog(1, "cu_outputs: %s" % cu_outputs) + logging.vlog(1, "state_tuple: %s" % str(state_tuple)) + logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple)) + logging.vlog(1, "inp_grad: %s" % inp_grad) + logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad) + logging.vlog(1, "state_grad: %s" % str(state_grad)) + logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad)) + logging.vlog(1, "wgrad: %s" % str(wgrad)) + logging.vlog(1, "bgrad: %s" % str(bgrad)) + logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad)) + logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad)) + return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, + cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, + cu_bgrad) else: - raise ValueError("Invalid rnn_mode: %s" % rnn_mode) - return model_fn( - num_layers, - num_units, - input_size, - direction=direction, - dtype=dtype, - dropout=dropout) + outputs, state_tuple = sess.run([outputs_op, state_tuple_op]) + cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op]) + + logging.vlog(1, "outputs: %s" % outputs) + logging.vlog(1, "cu_outputs: %s" % cu_outputs) + logging.vlog(1, "state_tuple: %s" % str(state_tuple)) + logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple)) + return outputs, cu_outputs, state_tuple, cu_state_tuple -def _CreateParamsSavable(params, - model, - base_variable_scope=None, - name="params_canonical"): - """Create a RNNParamsSaveable for the weight and bias parameters. +# Basic set of RNN configs to test. They can be further extended in relevant +# test (e.g. adding num_dirs). +NAMED_RNN_TESTCASES = ({ + "testcase_name": "xsmall", + "num_units": 1, + "input_size": 1, + "batch_size": 1, + "time": 1, + "num_layers": 1, +}, { + "testcase_name": "small", + "num_units": 4, + "input_size": 4, + "batch_size": 4, + "time": 4, + "num_layers": 1, +}, { + "testcase_name": "medium", + "num_units": 128, + "input_size": 64, + "batch_size": 8, + "time": 16, + "num_layers": 1, +}, { + "testcase_name": "large", + "num_units": 128, + "input_size": 128, + "batch_size": 16, + "time": 32, + "num_layers": 1, +}) + + +def ExpandNamedTestCases(inputs, *remove_keys, **extra_configs): + """Expands testcase with new config dimensions. 
+ + Example: + inputs = ( + {'testcase_name': 'test1', 'gender': 'male'}, + {'testcase_name': 'test2', 'gender': 'female'}, + ) + remove_keys: empty + extra_configs = { + 'age': [40, 80], + 'height': [5, 6] + } + + Returns: + ( + {'testcase_name': 'test1_age_40_height_5', 'gender': 'male', 'age': + 40, 'height': 5} + {'testcase_name': 'test1_age_40_height_6', 'gender': 'male', 'age': 40, + 'height': 6} + {'testcase_name': 'test1_age_80_height_5', 'gender': 'male', 'age': 80, + 'height': 5} + {'testcase_name': 'test1_age_80_height_6', 'gender': 'male', 'age': 80, + 'height': 6} + + {'testcase_name': 'test2_age_40_height_5', 'gender': 'female', 'age': + 40, + 'height': 5} + {'testcase_name': 'test2_age_40_height_6', 'gender': 'female', 'age': + 40, + 'height': 6} + {'testcase_name': 'test2_age_80_height_5', 'gender': 'female', 'age': + 80, + 'height': 5} + {'testcase_name': 'test2_age_80_height_6', 'gender': 'female', 'age': + 80, + 'height': 6} + ) Args: - params: a Variable for weight and bias parameters. - model: a CudnnRNN model. - base_variable_scope: a string, prefix of names of saved variables. - name: a string, name of the RNNParamsSaveable object. + inputs: A list of dictionaries, each being a testcase. + *remove_keys: A list of keys in each testcase which are not needed in the new + testcases. + **extra_configs: A dict of new test dimensions and applicable values in that + dimension. + Returns: - a RNNParamsSaveable object. + A list of dictionaries with expanded test cases. """ - if model._rnn_mode == CUDNN_LSTM: - fn = cudnn_rnn_ops.CudnnLSTMSaveable - elif model._rnn_mode == CUDNN_GRU: - fn = cudnn_rnn_ops.CudnnGRUSaveable - elif model._rnn_mode == CUDNN_RNN_TANH: - fn = cudnn_rnn_ops.CudnnRNNTanhSaveable - elif model._rnn_mode == CUDNN_RNN_RELU: - fn = cudnn_rnn_ops.CudnnRNNReluSaveable - params_saveable = fn( - params, - model.num_layers, - model.num_units, - model.input_size, - model.input_mode, - model.direction, - scope=base_variable_scope, - name=name) - ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable) - return params_saveable + res = [] + ordered_extra_configs = collections.OrderedDict(extra_configs) + keys = ordered_extra_configs.keys() + # A list of list of configs. + # The outer loop iterates over keys, the inner over the values of one key. + combined_kv = [[(k, v) for v in ordered_extra_configs[k]] for k in keys] + logging.info("combined_kv: %s", combined_kv) + + for inp in inputs: + # Each inp is a dict + for config in itertools.product(*combined_kv): + new_inp = dict(inp) + # config is a list in the form of [(k_i, v_j), (k_p, v_q), ...] + suffix = ["%s_%s" % (p[0], str(p[1])) for p in config] + suffix = "_".join(suffix) + new_inp["testcase_name"] += "_" + suffix + for k, v in config: + new_inp[k] = v + # Remove unused keys from the new test case. + if remove_keys: + if not isinstance(remove_keys, (list, tuple)): + remove_keys = [remove_keys] + for k in remove_keys: + new_inp.pop(k, None) + logging.info("new_inp: %s", new_inp) + res.append(new_inp) + # Dedup, necessary if `remove_keys` is set.
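# --- Illustrative sketch (not part of the patch): the dedup idiom used in the
# return statement just below. Dicts are unhashable, so each testcase dict is
# turned into a tuple of its items, collapsed through a set, and rebuilt. This
# assumes every config value is hashable and that duplicate dicts were built
# with the same key order (true for the literal testcases above).
cases = [{"testcase_name": "a", "num_units": 4},
         {"testcase_name": "a", "num_units": 4},   # duplicate entry
         {"testcase_name": "b", "num_units": 8}]
deduped = [dict(t) for t in {tuple(d.items()) for d in cases}]
assert len(deduped) == 2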
+ return [dict(t) for t in {tuple(d.items()) for d in res}] -def _MinLSTMParamSize(num_layers, - num_units, - input_size, - direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION): - if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION: - first_layer_weights = 4 * num_units * (num_units + input_size) - higher_layer_weights = 8 * (num_layers - 1) * num_units * num_units - all_biases = 8 * num_layers * num_units - return first_layer_weights + higher_layer_weights + all_biases - elif direction == cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION: - first_layer_weights = 4 * num_units * (num_units + input_size) - higher_layer_weights = (num_layers - 1) * ( - 4 * 2 * num_units * num_units + 4 * num_units**2) - all_biases = 8 * num_layers * num_units - return 2 * (first_layer_weights + higher_layer_weights + all_biases) +class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase): + + def _test_training_helper(self, + num_units, + input_size, + batch_size, + time, + num_layers, + dtype, + rtol=2e-6, + atol=2e-6): + with self.session(use_gpu=True) as sess: + (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, cu_inp_grad, + state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunLSTM( + sess, num_units, input_size, batch_size, time, num_layers) + + self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol) + for s, cu_s in zip(state_tuple, cu_state_tuple): + self.assertAllClose(s, cu_s, rtol=rtol, atol=atol) + for sg, cu_sg in zip(state_grad, cu_state_grad): + self.assertAllClose(sg, cu_sg, rtol=rtol, atol=atol) + self.assertAllClose(inp_grad, cu_inp_grad, rtol=rtol, atol=atol) + self.assertAllClose(bgrad, cu_bgrad, rtol=rtol, atol=atol) + self.assertAllClose(wgrad, cu_wgrad, rtol=rtol, atol=atol) + + @parameterized.named_parameters(*NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_training(self, num_units, input_size, batch_size, time, num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + self._test_training_helper(num_units, input_size, batch_size, time, + num_layers, dtypes.float32) + + @parameterized.named_parameters(*NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_training_fp16(self, num_units, input_size, batch_size, time, + num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + self._test_training_helper( + num_units, + input_size, + batch_size, + time, + num_layers, + dtypes.float16, + rtol=5e-3, + atol=5e-4) + + @parameterized.named_parameters(*NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_inference(self, num_units, input_size, batch_size, time, num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + with self.session(use_gpu=True) as sess: + (outputs, cu_outputs, state_tuple, cu_state_tuple) = RunLSTM( + sess, + num_units, + input_size, + batch_size, + time, + num_layers, + is_training=False) + + self.assertAllClose(outputs, cu_outputs) + # h + self.assertAllClose(state_tuple.h, cu_state_tuple.h) + # c + self.assertAllClose(state_tuple.c, cu_state_tuple.c) + + @parameterized.named_parameters(*NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_inference_fp16(self, num_units, input_size, batch_size, time, + num_layers): + if not context.context().num_gpus(): + 
self.skipTest("No GPUs found") + with self.session(use_gpu=True) as sess: + (outputs, cu_outputs, state_tuple, cu_state_tuple) = RunLSTM( + sess, + num_units, + input_size, + batch_size, + time, + num_layers, + is_training=False, + dtype=dtypes.float16) + + rtol, atol = 5e-3, 5e-4 + self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol) + # h + self.assertAllClose( + state_tuple.h, cu_state_tuple.h, rtol=rtol, atol=atol) + # c + self.assertAllClose( + state_tuple.c, cu_state_tuple.c, rtol=rtol, atol=atol) + + @parameterized.named_parameters(*NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_inference_with_dropout(self, num_units, input_size, batch_size, time, + num_layers): + """Validates that dropout does not affect Cudnn Rnn inference.""" + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + # Hand-picked dropouts are used below (0. and 1.) + with ops.Graph().as_default() as g: + with self.session(use_gpu=True, graph=g) as sess: + # 1st time w/o dropout. + (_, cu_outputs, _, cu_state_tuple) = RunLSTM( + sess, + num_units, + input_size, + batch_size, + time, + num_layers, + is_training=False, + dropout=0.) + + with ops.Graph().as_default() as g: + with self.session(use_gpu=True, graph=g) as sess: + (_, cu_outputs2, _, cu_state_tuple2) = RunLSTM( + sess, + num_units, + input_size, + batch_size, + time, + num_layers, + is_training=False, + dropout=1.) + + self.assertAllClose(cu_outputs, cu_outputs2) + # h + self.assertAllClose(cu_state_tuple.h, cu_state_tuple2.h) + # c + self.assertAllClose(cu_state_tuple.c, cu_state_tuple2.c) + + +def RunGRU(sess, + num_units, + input_size, + batch_size, + time, + num_layers=1, + is_training=True, + dropout=0., + num_dirs=True, + dtype=dtypes.float32): + # TODO(jamesqin): add multi-layer tests. + # TODO(jamesqin): add multi-dir tests + assert num_layers == 1 + assert num_dirs == 1 + if is_training and not np.isclose(dropout, 0): + raise ValueError("dropout can not be 0. when test training.") + + # set graph level random seed and numpy random seed. 
+ random_seed.set_random_seed(0) + np.random.seed(0) + + inputs = variable_scope.get_variable( + "inputs", + initializer=np.random.rand(time, batch_size, + input_size).astype(dtype.as_numpy_dtype), + dtype=dtype) + initial_h_op = variable_scope.get_variable( + "initial_h_op", + initializer=np.random.rand(batch_size, + num_units).astype(dtype.as_numpy_dtype), + dtype=dtype) + + initializer = init_ops.random_uniform_initializer( + -0.01, 0.01, dtype=dtype, seed=19980904) + with variable_scope.variable_scope("test", initializer=initializer): + gate_kernel = variable_scope.get_variable( + "rnn/cudnn_compatible_gru_cell/gates/kernel", + shape=[input_size + num_units, num_units * 2], + dtype=dtype) + gate_bias = variable_scope.get_variable( + "rnn/cudnn_compatible_gru_cell/gates/bias", + shape=[num_units * 2], + dtype=dtype) + candidate_inp_kernel = variable_scope.get_variable( + "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel", + shape=[input_size, num_units], + dtype=dtype) + candidate_inp_bias = variable_scope.get_variable( + "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias", + shape=[num_units], + dtype=dtype) + candidate_hid_kernel = variable_scope.get_variable( + "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel", + shape=[num_units, num_units], + dtype=dtype) + candidate_hid_bias = variable_scope.get_variable( + "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias", + shape=[num_units], + dtype=dtype) + + cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True) + outputs_op, h_op = rnn.dynamic_rnn( + cell, + inputs, + initial_state=initial_h_op, + dtype=dtype, + time_major=True, + scope=None) + + ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel] + bs = [gate_bias, candidate_inp_bias, candidate_hid_bias] + # Convert to cudnn opaque param. + format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU( + num_layers, num_units, input_size) + opaque_params = format_converter.tf_canonical_to_opaque(ws + bs) + + cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0) + cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn( + inputs, + cu_initial_h_op, + array_ops.zeros_like(cu_initial_h_op), # not used + opaque_params, + dropout=dropout, + is_training=is_training, + rnn_mode=cudnn_rnn_ops.CUDNN_GRU) + + if is_training: + (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op, + cib_grad_op, chb_grad_op) = gradients_impl.gradients( + outputs_op, [inputs, initial_h_op] + ws + bs) + + (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients( + cu_outputs_op, [inputs, cu_initial_h_op, opaque_params]) + # Remove the trivial 1st dimension + cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0) + + cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical( + opaque_grad_op) + (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op + (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op + # cudnn gru has 2 biases for reset and update gates. When converting to tf + # canonical format, the two biases are summed into one. Thus here relevant + # bias gradient should be halved before comparing with tf gru. 
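# --- Illustrative sketch (not part of the patch; assumes the cuDNN GRU
# formulation mirrored by CudnnCompatibleGRUCell): the reset/update gate biases
# are the only purely additive pair (z = sigmoid(W.x + b_i + R.h + b_h)), so
# only their summed canonical gradient is doubled and needs the 0.5 factor
# below. The candidate biases stay separate because the hidden-side one sits
# inside the r * (...) term, which scales its gradient by the reset gate:
import numpy as np

rng = np.random.RandomState(0)
x_proj, h_proj = rng.rand(3), rng.rand(3)        # stand-ins for W_in.x, R_hn.h
b_in, b_hn, r = rng.rand(3), rng.rand(3), rng.rand(3)

def loss(bi, bh):
  # candidate pre-activation: n = tanh(W_in.x + b_in + r * (R_hn.h + b_hn))
  return np.sum(np.tanh(x_proj + bi + r * (h_proj + bh)) ** 2)

eps, e0 = 1e-6, np.eye(3)[0]
g_in = (loss(b_in + eps * e0, b_hn) - loss(b_in, b_hn)) / eps
g_hn = (loss(b_in, b_hn + eps * e0) - loss(b_in, b_hn)) / eps
assert np.isclose(g_hn, r[0] * g_in, rtol=1e-3)  # not a simple 2x relationship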
+ cu_gb_grad_op *= 0.5 + + init_op = variables.global_variables_initializer() + sess.run(init_op) + + if is_training: + outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([ + outputs_op, h_op, inp_grad_op, hgrad_op, + (gk_grad_op, cik_grad_op, chk_grad_op), + (gb_grad_op, cib_grad_op, chb_grad_op) + ]) + (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run([ + cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op, + (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op), + (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) + ]) + # Remove the trivial 1st dimension + cu_h = np.squeeze(cu_h, axis=0) + + logging.vlog(1, "outputs: %s" % outputs) + logging.vlog(1, "cu_outputs: %s" % cu_outputs) + logging.vlog(1, "h: %s" % h) + logging.vlog(1, "cu_h: %s" % h) + logging.vlog(1, "inp_grad: %s" % inp_grad) + logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad) + logging.vlog(1, "hgrad: %s" % hgrad) + logging.vlog(1, "cu_hgrad: %s" % cu_hgrad) + logging.vlog(1, "wgrad: %s" % str(wgrad)) + logging.vlog(1, "bgrad: %s" % str(bgrad)) + logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad)) + logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad)) + return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad, + cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad) else: - raise ValueError("%s direction is not supported.") + outputs, h = sess.run([outputs_op, h_op]) + cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op]) + # Remove the trivial 1st dimension. + cu_h = np.squeeze(cu_h, axis=0) + + logging.vlog(1, "outputs: %s" % outputs) + logging.vlog(1, "cu_outputs: %s" % cu_outputs) + logging.vlog(1, "h: %s" % h) + logging.vlog(1, "cu_h: %s" % h) + return outputs, cu_outputs, h, cu_h -class CudnnRNNTestSaveRestore(TensorFlowTestCase): +class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase): - def _CompareWeights(self, lhs, rhs): - self.assertEqual(len(lhs), len(rhs)) - for lw, rw in zip(lhs, rhs): - self.assertAllEqual(lw, rw) + def _test_training_helper(self, + num_units, + input_size, + batch_size, + time, + num_layers, + dtype, + rtol=2e-6, + atol=2e-6): + with self.session(use_gpu=True) as sess: + (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad, + cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunGRU( + sess, num_units, input_size, batch_size, time, num_layers) - def _CompareBiases(self, lhs, rhs, rnn_mode, num_layers, direction): - self.assertEqual(len(lhs), len(rhs)) - if rnn_mode == CUDNN_LSTM: - num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER - elif rnn_mode == CUDNN_GRU: - num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER - elif rnn_mode == CUDNN_RNN_TANH: - num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER - else: - num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER - num_dirs = 1 if direction == CUDNN_RNN_UNIDIRECTION else 2 - num_params_per_layer *= num_dirs - self.assertEqual(num_params_per_layer * num_layers, len(lhs)) + self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol) + self.assertAllClose(h, cu_h, rtol=rtol, atol=atol) + self.assertAllClose(hgrad, cu_hgrad, rtol=rtol, atol=atol) + self.assertAllClose(inp_grad, cu_inp_grad, rtol=rtol, atol=atol) + for bg, cu_bg in zip(bgrad, cu_bgrad): + self.assertAllClose(bg, cu_bg, rtol=rtol, atol=atol) + for wg, cu_wg in zip(wgrad, cu_wgrad): + self.assertAllClose(wg, cu_wg, rtol=rtol, atol=atol) - for i in range(num_layers): - layer_lhs = lhs[i * num_params_per_layer: (i+1) * num_params_per_layer] - layer_rhs = rhs[i * num_params_per_layer: (i+1) * num_params_per_layer] - if direction == CUDNN_RNN_UNIDIRECTION: - 
self._CompareSingleLayerBiases(layer_lhs, layer_rhs) - else: - size = len(layer_lhs) - fw_lhs, bw_lhs = layer_lhs[:size//2], layer_lhs[size//2:] - fw_rhs, bw_rhs = layer_rhs[:size//2], layer_rhs[size//2:] - self._CompareSingleLayerBiases(fw_lhs, fw_rhs) - self._CompareSingleLayerBiases(bw_lhs, bw_rhs) + @parameterized.named_parameters(*NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_training(self, num_units, input_size, batch_size, time, num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + self._test_training_helper(num_units, input_size, batch_size, time, + num_layers, dtypes.float32) - def _CompareSingleLayerBiases(self, lhs, rhs): - self.assertEqual(len(lhs), len(rhs)) + @parameterized.named_parameters(*NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_training_fp16(self, num_units, input_size, batch_size, time, + num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + self._test_training_helper( + num_units, + input_size, + batch_size, + time, + num_layers, + dtypes.float16, + rtol=5e-3, + atol=5e-4) - lf_lhs, rt_lhs = lhs[:len(lhs)//2], lhs[len(lhs)//2:] - lf_rhs, rt_rhs = rhs[:len(rhs)//2], rhs[len(rhs)//2:] - self.assertEqual(len(lf_lhs), len(rt_lhs)) - self.assertEqual(len(lf_rhs), len(rt_rhs)) + @parameterized.named_parameters(*NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_inference(self, num_units, input_size, batch_size, time, num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + with self.session(use_gpu=True) as sess: + (outputs, cu_outputs, h, cu_h) = RunGRU( + sess, + num_units, + input_size, + batch_size, + time, + num_layers, + is_training=False) + self.assertAllClose(outputs, cu_outputs) + self.assertAllClose(h, cu_h) - sum_lhs, sum_rhs = [], [] - for lf, rt in zip(lf_lhs, rt_lhs): - sum_lhs.append(lf + rt) - for lf, rt in zip(lf_rhs, rt_rhs): - sum_rhs.append(lf + rt) - self.assertEqual(len(sum_lhs), len(sum_rhs)) - for lf, rt in zip(sum_lhs, sum_rhs): - self.assertAllEqual(lf, rt) + @parameterized.named_parameters(*NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_inference_fp16(self, num_units, input_size, batch_size, time, + num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + with self.session(use_gpu=True) as sess: + (outputs, cu_outputs, h, cu_h) = RunGRU( + sess, + num_units, + input_size, + batch_size, + time, + num_layers, + is_training=False, + dtype=dtypes.float16) - def _testSaveRestoreVariable(self, rnn_mode, direction, dtype): - num_layers = 2 - num_units = 7 - input_size = 3 - with ops.Graph().as_default(): - model = _CreateModel( - rnn_mode, - num_layers=num_layers, - num_units=num_units, - input_size=input_size, - direction=direction, - dtype=dtype) - random_seed.set_random_seed(1234) - params_size_t = model.params_size() - params = variables.VariableV1( - random_ops.random_uniform([params_size_t], dtype=dtype), - dtype=dtype, - validate_shape=False) - saveable = _CreateParamsSavable(params, model) - weights, biases = saveable.format_converter._opaque_to_cu_canonical( - saveable._variables) - reset_params = state_ops.assign( - params, - array_ops.zeros([params_size_t], dtype=dtype), - validate_shape=False) - 
save_path = os.path.join(self.get_temp_dir(), - "save-restore-variable-test") - saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) - # Passing graph explicitly, otherwise an old sess would be reused. - with self.test_session( - use_gpu=True, graph=ops.get_default_graph()) as sess: - sess.run(variables.global_variables_initializer()) - val = saver.save(sess, save_path) - self.assertEqual(save_path, val) + rtol, atol = 5e-3, 5e-4 + self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol) + self.assertAllClose(h, cu_h, rtol=rtol, atol=atol) - weights_v, biases_v = sess.run([weights, biases]) + @parameterized.named_parameters(*NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_inference_with_dropout(self, num_units, input_size, batch_size, time, + num_layers): + """Validates that dropout does not affect Cudnn Rnn inference.""" + # Hand-picked dropouts are used below (0. and 1.) + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + with ops.Graph().as_default() as g: + with self.session(use_gpu=True, graph=g) as sess: + # 1st time w/o dropout. + (_, cu_outputs, _, cu_h) = RunGRU( + sess, + num_units, + input_size, + batch_size, + time, + num_layers, + is_training=False, + dropout=0.) - sess.run(reset_params) - saver.restore(sess, save_path) - weights_v_restored, biases_v_restored = sess.run([weights, biases]) + with ops.Graph().as_default() as g: + with self.session(use_gpu=True, graph=g) as sess: + (_, cu_outputs2, _, cu_h2) = RunGRU( + sess, + num_units, + input_size, + batch_size, + time, + num_layers, + is_training=False, + dropout=1.) - self._CompareWeights(weights_v, weights_v_restored) - self._CompareBiases(biases_v, biases_v_restored, rnn_mode, num_layers, - direction) + self.assertAllClose(cu_outputs, cu_outputs2) + self.assertAllClose(cu_h[0], cu_h2[0]) - def _testSaveRestoreTwoVariables(self, rnn_mode, direction, dtype): - num_layers = 2 - num_units = 7 - input_size = 3 - with ops.Graph().as_default(): - model = _CreateModel( - rnn_mode, - num_layers=num_layers, - num_units=num_units, - input_size=input_size, - direction=direction, - dtype=dtype) - random_seed.set_random_seed(1234) - params_size_t = model.params_size() - names = ["rnn_1", "rnn_2"] - param_vars = [ - variables.VariableV1( - random_ops.random_uniform([params_size_t], dtype=dtype), - dtype=dtype, - validate_shape=False) for name in names - ] - saveables = [] - for name, params in zip(names, param_vars): - saveables.append(_CreateParamsSavable(params, model, name, name)) - weights1, biases1 = saveables[0].format_converter._opaque_to_cu_canonical( - saveables[0]._variables) - weights2, biases2 = saveables[1].format_converter._opaque_to_cu_canonical( - saveables[1]._variables) - reset_params = [ - state_ops.assign( - params, - array_ops.zeros([params_size_t], dtype=dtype), - validate_shape=False) for params in param_vars - ] - save_path = os.path.join(self.get_temp_dir(), - "save-restore-variable-test") - saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) - # Passing graph explicitly, otherwise an old sess would be reused. 
- with self.test_session(use_gpu=True, - graph=ops.get_default_graph()) as sess: - sess.run(variables.global_variables_initializer()) - val = saver.save(sess, save_path) - self.assertEqual(save_path, val) - weights1_v, biases1_v = sess.run([weights1, biases1]) - weights2_v, biases2_v = sess.run([weights2, biases2]) - sess.run(reset_params) - saver.restore(sess, save_path) - weights1_v_restored, biases1_v_restored = sess.run([weights1, biases1]) - weights2_v_restored, biases2_v_restored = sess.run([weights2, biases2]) +class CudnnParamsFormatConverterTest(TensorFlowTestCase, + parameterized.TestCase): + """Class for testing various format converters.""" - self._CompareWeights(weights1_v, weights1_v_restored) - self._CompareWeights(weights2_v, weights2_v_restored) - self._CompareBiases(biases1_v, biases1_v_restored, rnn_mode, num_layers, - direction) - self._CompareBiases(biases2_v, biases2_v_restored, rnn_mode, num_layers, - direction) + def _test_lstm_helper(self, num_units, input_size, num_layers, direction): + with self.session(use_gpu=True) as sess: + random_seed.set_random_seed(0) + np.random.seed(0) - def _testSaveRestoreOutput(self, rnn_mode, direction, dtype): - with ops.Graph().as_default(): - num_layers = 2 - num_units = 7 - input_size = 7 - seq_length = 10 - batch_size = 5 - dir_count = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2 - model = _CreateModel( - rnn_mode, + num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2 + format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM( + num_layers, num_units, input_size, direction=direction) + + ws, bs = [], [] + for _ in range(num_layers * num_dirs): + w = constant_op.constant( + np.random.rand(input_size + num_units, 4 * num_units), + dtype=dtypes.float32) + b = constant_op.constant( + np.random.rand(4 * num_units), dtype=dtypes.float32) + ws.append(w) + bs.append(b) + + opaque_params = format_converter.tf_canonical_to_opaque(ws + bs) + opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size( + cudnn_rnn_ops.CUDNN_LSTM, num_layers, num_units, input_size, - direction=direction, - dtype=dtype) - params_size_t = model.params_size() - params = variables.VariableV1( - array_ops.ones([params_size_t], dtype=dtype), - validate_shape=False, - dtype=dtype) - _CreateParamsSavable(params, model) - save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test") + direction=direction) + + ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params) + + # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical() + # returns the original input. 
+ ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r]) + for w, w_r in zip(ws, ws_r): + self.assertAllClose(w, w_r) + for b, b_r in zip(bs, bs_r): + self.assertAllClose(b, b_r) + + # Test opaque_params size lower bound + opaque_params_size_v = sess.run(opaque_params_size) + min_params_size = sum(x.size for x in ws) + np.sum(x.size for x in bs) + logging.info("min_parm_size: %d vs actual_opaque_param_size: %d", + min_params_size, opaque_params_size_v) + self.assertLessEqual(min_params_size, opaque_params_size_v) + + @parameterized.named_parameters((c["testcase_name"], c["num_units"], + c["input_size"], c["num_layers"]) + for c in NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_lstm(self, num_units, input_size, num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + self._test_lstm_helper(num_units, input_size, num_layers, + cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) + + @parameterized.named_parameters((c["testcase_name"], c["num_units"], + c["input_size"], c["num_layers"]) + for c in NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_lstm_bidi(self, num_units, input_size, num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + self._test_lstm_helper(num_units, input_size, num_layers, + cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION) + + def _test_gru_helper(self, num_units, input_size, num_layers, direction): + with self.session(use_gpu=True) as sess: + random_seed.set_random_seed(0) + np.random.seed(0) + + num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2 + format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU( + num_layers, num_units, input_size, direction=direction) + + ws, bs = [], [] + for _ in range(num_layers * num_dirs): + gate_kernel = constant_op.constant( + np.random.rand(input_size + num_units, num_units * 2), + dtype=dtypes.float32) + gate_bias = constant_op.constant( + np.random.rand(num_units * 2), dtype=dtypes.float32) + candidate_inp_kernel = constant_op.constant( + np.random.rand(input_size, num_units), dtype=dtypes.float32) + candidate_inp_bias = constant_op.constant( + np.random.rand(num_units), dtype=dtypes.float32) + candidate_hid_kernel = constant_op.constant( + np.random.rand(num_units, num_units), dtype=dtypes.float32) + candidate_hid_bias = constant_op.constant( + np.random.rand(num_units), dtype=dtypes.float32) + ws.extend([gate_kernel, candidate_inp_kernel, candidate_hid_kernel]) + bs.extend([gate_bias, candidate_inp_bias, candidate_hid_bias]) + + opaque_params = format_converter.tf_canonical_to_opaque(ws + bs) + opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size( + cudnn_rnn_ops.CUDNN_GRU, + num_layers, + num_units, + input_size, + direction=direction) + + ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params) + + # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical() + # returns the original input. 
+ ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r]) + for w, w_r in zip(ws, ws_r): + self.assertAllClose(w, w_r) + for b, b_r in zip(bs, bs_r): + self.assertAllClose(b, b_r) + + # Test opaque_params size lower bound + opaque_params_size_v = sess.run(opaque_params_size) + min_params_size = sum(x.size for x in ws) + sum(x.size for x in bs) + logging.info("min_parm_size: %d vs actual_opaque_param_size: %d", + min_params_size, opaque_params_size_v) + self.assertLessEqual(min_params_size, opaque_params_size_v) + + @parameterized.named_parameters((c["testcase_name"], c["num_units"], + c["input_size"], c["num_layers"]) + for c in NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_gru(self, num_units, input_size, num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + self._test_gru_helper(num_units, input_size, num_layers, + cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) + + @parameterized.named_parameters((c["testcase_name"], c["num_units"], + c["input_size"], c["num_layers"]) + for c in NAMED_RNN_TESTCASES) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_gru_bidi(self, num_units, input_size, num_layers): + if not context.context().num_gpus(): + self.skipTest("No GPUs found") + self._test_gru_helper(num_units, input_size, num_layers, + cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION) + + +class CudnnRnnSaveRestoreTest(TensorFlowTestCase, parameterized.TestCase): + """Class for testing various Cudnn Rnn SaveableObjects.""" + + def _create_opaque_param(self, + rnn_mode, + num_units, + input_size, + num_layers, + direction, + name=None): + param_size_t = cudnn_rnn_ops.cudnn_rnn_opaque_params_size( + rnn_mode, num_layers, num_units, input_size, direction=direction) + init_val = random_ops.random_uniform([param_size_t]) + return variable_scope.get_variable( + name or "opaque_param", initializer=init_val, validate_shape=False) + + def _create_saveable(self, opaque_param, rnn_mode, num_units, input_size, + num_layers, direction): + if rnn_mode == CUDNN_LSTM: + fn = cudnn_rnn_ops.CudnnLSTMSaveable + elif rnn_mode == CUDNN_GRU: + fn = cudnn_rnn_ops.CudnnGRUSaveable + elif rnn_mode == CUDNN_RNN_TANH: + fn = cudnn_rnn_ops.CudnnRNNTanhSaveable + elif rnn_mode == CUDNN_RNN_RELU: + fn = cudnn_rnn_ops.CudnnRNNReluSaveable + saveable = fn( + opaque_param, num_layers, num_units, input_size, direction=direction) + return saveable + + def _compare_weights(self, lhs, rhs): + self.assertLen(rhs, len(lhs)) + for lw, rw in zip(lhs, rhs): + self.assertAllEqual(lw, rw) + + def _compare_biases(self, lhs, rhs): + self.assertLen(rhs, len(lhs)) + for lf, rt in zip(lhs, rhs): + self.assertAllEqual(lf, rt) + + @parameterized.named_parameters( + ExpandNamedTestCases( + NAMED_RNN_TESTCASES, "time", "batch_size", **{ + "rnn_mode": [ + CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH + ], + "direction": [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION] + })) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_save_restore_variable(self, rnn_mode, num_units, input_size, + num_layers, direction): + # Verify the restored opaque param, once converted to tf_canonical format, + # is the same as the tf canonicals of the pre-restored param. 
+ if not context.context().num_gpus(): + self.skipTest("No GPUs found") + with self.session(use_gpu=True) as sess: + opaque_param = self._create_opaque_param(rnn_mode, num_units, input_size, + num_layers, direction) + saveable = self._create_saveable(opaque_param, rnn_mode, num_units, + input_size, num_layers, direction) + ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) + weights_op, biases_op = saveable.format_converter.opaque_to_tf_canonical( + saveable._variables) + + save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) - np.random.seed(1234) - has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM) - input_data = constant_op.constant( - np.random.randn(seq_length, batch_size, input_size), dtype=dtype) - input_h = constant_op.constant( - np.random.randn(num_layers * dir_count, batch_size, num_units), - dtype=dtype) - if has_input_c: - input_c = constant_op.constant( - np.random.randn(num_layers * dir_count, batch_size, num_units), - dtype=dtype) - outputs = model( - input_data=input_data, - input_h=input_h, - input_c=input_c, - params=params, - is_training=False) - else: - outputs = model( - input_data=input_data, - input_h=input_h, - params=params, - is_training=False) - total_sum = sum(map(math_ops.reduce_sum, outputs)) - # Passing graph explicitly, otherwise an old sess would be reused. - with self.test_session( - use_gpu=True, graph=ops.get_default_graph()) as sess: - sess.run(variables.global_variables_initializer()) - total_sum_v = sess.run(total_sum) - val = saver.save(sess, save_path) - self.assertEqual(save_path, val) - # Passing graph explicitly, otherwise an old sess would be reused. - with self.test_session( - use_gpu=True, graph=ops.get_default_graph()) as sess: - reset_params = state_ops.assign( - params, - array_ops.zeros([params_size_t], dtype=dtype), - validate_shape=False) - sess.run(reset_params) + init_op = variables.global_variables_initializer() + reset_op = state_ops.assign(opaque_param, + array_ops.zeros_like(opaque_param)) + sess.run(init_op) + self.assertEqual(save_path, saver.save(sess, save_path)) + + # Get the tf canonical vals before reset-restore + weights, biases = sess.run([weights_op, biases_op]) + + # Reset the opaque param value + sess.run(reset_op) + # Assert reset happened. + weights_z, biases_z = sess.run([weights_op, biases_op]) + for w in weights_z: + self.assertAllClose(w, np.zeros_like(w)) + for b in biases_z: + self.assertAllClose(b, np.zeros_like(b)) + + # Restore opaque param value from checkpoint. + saver.restore(sess, save_path) + weights_r, biases_r = sess.run([weights_op, biases_op]) + self._compare_weights(weights, weights_r) + self._compare_biases(biases, biases_r) + + @parameterized.named_parameters( + ExpandNamedTestCases( + NAMED_RNN_TESTCASES, "time", "batch_size", **{ + "rnn_mode": [ + CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH + ], + "direction": [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION] + })) + @unittest.skipUnless(test.is_built_with_cuda(), + "Test only applicable when running on GPUs") + def test_save_restore_multi_variables(self, rnn_mode, num_units, input_size, + num_layers, direction): + # Verify the restored opaque param, once converted to tf_canonical format, + # is the same as the tf canonicals of the pre-restored param. 
+ if not context.context().num_gpus(): + self.skipTest("No GPUs found") + with self.session(use_gpu=True) as sess: + opaque_params = [] + saveables = [] + num_opaque_params = 2 + for i in range(num_opaque_params): + opaque_params.append( + self._create_opaque_param( + rnn_mode, + num_units, + input_size, + num_layers, + direction, + name="opaque_param_%d" % i)) + saveable = self._create_saveable(opaque_params[i], rnn_mode, num_units, + input_size, num_layers, direction) + ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) + saveables.append(saveable) + + weights_ops, biases_ops = [], [] + for i in range(num_opaque_params): + weights_op, biases_op = ( + saveables[i].format_converter.opaque_to_tf_canonical( + saveables[i]._variables)) + weights_ops.append(weights_op) + biases_ops.append(biases_op) + + save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test") + saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) + + init_op = variables.global_variables_initializer() + reset_ops = [] + for i in range(num_opaque_params): + reset_ops.append( + state_ops.assign(opaque_params[i], + array_ops.zeros_like(opaque_params[i]))) + sess.run(init_op) + self.assertEqual(save_path, saver.save(sess, save_path)) + + # Get the tf canonical vals before reset-restore + for i in range(num_opaque_params): + weights, biases = sess.run([weights_ops[i], biases_ops[i]]) + + # Reset the opaque param value + sess.run(reset_ops[i]) + + # Assert reset happened. + weights_z, biases_z = sess.run([weights_ops[i], biases_ops[i]]) + for w in weights_z: + self.assertAllClose(w, np.zeros_like(w)) + for b in biases_z: + self.assertAllClose(b, np.zeros_like(b)) + + # Restore opaque param value from checkpoint. saver.restore(sess, save_path) - total_sum_v_restored = sess.run(total_sum) - self.assertAllClose(total_sum_v, total_sum_v_restored, atol=1e-5) - - @unittest.skipUnless(test.is_built_with_cuda(), - "Test only applicable when running on GPUs") - def testSaveRestore(self): - rnn_modes = [ - cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU, - cudnn_rnn_ops.CUDNN_RNN_TANH, cudnn_rnn_ops.CUDNN_RNN_RELU - ] - directions = [ - cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION, - cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION - ] - dtype_list = [dtypes.float32, dtypes.float64] - for rnn_mode, direction, dtype in itertools.product(rnn_modes, directions, - dtype_list): - self._testSaveRestoreVariable(rnn_mode, direction, dtype) - self._testSaveRestoreTwoVariables(rnn_mode, direction, dtype) - self._testSaveRestoreOutput(rnn_mode, direction, dtype) - - -class CudnnRNNTestParamsSize(TensorFlowTestCase): - - def _testOneLSTMParamsSize(self, num_layers, num_units, input_size, - direction): - logging.info("Testing one lstm param size with config: %s", locals()) - min_params_size = _MinLSTMParamSize(num_layers, num_units, input_size, - direction) - model = _CreateModel( - cudnn_rnn_ops.CUDNN_LSTM, - num_layers, - num_units, - input_size, - direction=direction) - params_size = model.params_size() - with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: - params_size_v = sess.run(params_size) - self.assertLessEqual(min_params_size, params_size_v) - - @unittest.skipUnless(test.is_built_with_cuda(), - "Test only applicable when running on GPUs") - def testLSTMParamsSize(self): - test_configs = [ - [4, 200, 200], - [4, 200, 300], - [4, 200, 100], - [1, 100, 200], - [2, 200, 100], - [3, 200, 400], - ] - directions = [ - cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION, - cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION - ] - for (config, 
direction) in itertools.product(test_configs, directions): - num_layers, num_units, input_size = config - with ops.Graph().as_default(): - self._testOneLSTMParamsSize(num_layers, num_units, input_size, - direction) - - @unittest.skipUnless(test.is_built_with_cuda(), - "Test only applicable when running on GPUs") - def testLSTMParamsSizeShape(self): - with self.assertRaisesRegexp( - ValueError, "Shape must be rank 0 but is rank 1"): - model = _CreateModel( - cudnn_rnn_ops.CUDNN_LSTM, - constant_op.constant([4]), 200, 200, - direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) - _ = model.params_size() - with self.assertRaisesRegexp( - ValueError, "Shape must be rank 0 but is rank 1"): - model = _CreateModel( - cudnn_rnn_ops.CUDNN_LSTM, - 4, constant_op.constant([200]), 200, - direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) - _ = model.params_size() - with self.assertRaisesRegexp( - ValueError, "Shape must be rank 0 but is rank 1"): - model = _CreateModel( - cudnn_rnn_ops.CUDNN_LSTM, - 4, 200, constant_op.constant([200]), - direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) - _ = model.params_size() - - -class CudnnRNNTestInference(TensorFlowTestCase): - - def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size, - batch_size, seq_length, dir_count, dropout, - expected, tolerance): - random_seed.set_random_seed(5678) - model = _CreateModel( - rnn_mode, - num_layers, - num_units, - input_size, - input_mode="auto_select", - direction=(cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION if dir_count == 1 - else cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION), - dropout=dropout) - has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM) - params_size_t = model.params_size() - input_data = array_ops.ones([seq_length, batch_size, input_size]) - input_h = array_ops.ones([num_layers * dir_count, batch_size, num_units]) - params = variables.VariableV1( - array_ops.ones([params_size_t]), validate_shape=False) - if has_input_c: - input_c = array_ops.ones([num_layers * dir_count, batch_size, num_units]) - output, output_h, output_c = model( - input_data=input_data, - input_h=input_h, - input_c=input_c, - params=params, - is_training=False) - else: - output, output_h = model( - input_data=input_data, - input_h=input_h, - params=params, - is_training=False) - output_sum = math_ops.reduce_sum(output) - output_h_sum = math_ops.reduce_sum(output_h) - total_sum = output_sum + output_h_sum - if has_input_c: - output_c_sum = math_ops.reduce_sum(output_c) - total_sum += output_c_sum - with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: - sess.run(variables.global_variables_initializer()) - total_sum_v = sess.run([total_sum]) - - self.assertAllClose( - total_sum_v[0], expected, atol=tolerance, rtol=tolerance) - - @unittest.skipUnless(test.is_built_with_cuda(), - "Test only applicable when running on GPUs") - def testSimpleInference(self): - test_configs = [ - { - "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM, - "expected": 231833.22, - "tolerance": 1e-2, - "shape": { - "num_layers": 4, - "num_units": 200, - "input_size": 200, - "batch_size": 20, - "seq_length": 10, - "dir_count": 1, - }, - }, - { - "rnn_mode": cudnn_rnn_ops.CUDNN_GRU, - "expected": 56000, - "tolerance": 1e-2, - "shape": { - "num_layers": 4, - "num_units": 200, - "input_size": 200, - "batch_size": 20, - "seq_length": 10, - "dir_count": 1, - }, - }, - { - "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH, - "expected": 56000, - "tolerance": 1e-2, - "shape": { - "num_layers": 4, - "num_units": 200, - "input_size": 200, - "batch_size": 20, - "seq_length": 10, 
- "dir_count": 1, - }, - }, - { - "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU, - "expected": 130688, - "tolerance": 1e-2, - "shape": { - "num_layers": 2, - "num_units": 8, - "input_size": 4, - "batch_size": 4, - "seq_length": 2, - "dir_count": 1, - }, - }, - ] - # Cudnn scales result for dropout during training, therefore dropout has no - # impact for inference results. - # (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most - # demonstrative of the dropout-invariant nature of CudnnRnn.) - dropouts = [0., 0.5, 1.] - for (config, dropout) in itertools.product(test_configs, dropouts): - rnn_mode = config["rnn_mode"] - expected = config["expected"] - tolerance = config["tolerance"] - shape = config["shape"] - with ops.Graph().as_default(): - self._testOneSimpleInference( - rnn_mode, shape["num_layers"], shape["num_units"], - shape["input_size"], shape["batch_size"], shape["seq_length"], - shape["dir_count"], dropout, expected, tolerance) - - -class CudnnRNNTestTraining(TensorFlowTestCase): - - def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size, - batch_size, seq_length, dir_count, dropout, dtype, - delta, tolerance): - # Gradient checking runs two forward ops with almost the same input. Need to - # make sure the drop patterns across the two runs are the same. - logging.info("Training test with config: %s", locals()) - old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False)) - os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True) - has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM) - random_seed.set_random_seed(5678) - direction = (cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION if dir_count == 1 - else cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION) - model = _CreateModel( - rnn_mode, - num_layers, - num_units, - input_size, - direction=direction, - dtype=dtype, - dropout=dropout) - params_size_t = model.params_size() - input_data = variables.VariableV1( - random_ops.random_uniform( - [seq_length, batch_size, input_size], dtype=dtype), - dtype=dtype) - input_h = variables.VariableV1( - random_ops.random_uniform( - [num_layers * dir_count, batch_size, num_units], dtype=dtype), - dtype=dtype) - params = variables.VariableV1( - random_ops.random_uniform([params_size_t], dtype=dtype), - validate_shape=False, - dtype=dtype) - if has_input_c: - input_c = variables.VariableV1( - random_ops.random_uniform( - [num_layers * dir_count, batch_size, num_units], dtype=dtype), - dtype=dtype) - - output, output_h, output_c = model( - input_data=input_data, - input_h=input_h, - input_c=input_c, - params=params) - else: - output, output_h = model( - input_data=input_data, input_h=input_h, params=params) - output_sum = math_ops.reduce_sum(output) - output_h_sum = math_ops.reduce_sum(output_h) - total_sum = output_sum + output_h_sum - if has_input_c: - output_c_sum = math_ops.reduce_sum(output_c) - total_sum += output_c_sum - - with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: - params_size_v = sess.run(params_size_t) - inputs_and_shapes = [ - (input_data, [seq_length, batch_size, input_size]), - (input_h, [num_layers * dir_count, batch_size, num_units]), - (params, [params_size_v]), - ] - if has_input_c: - inputs_and_shapes.append( - (input_c, [num_layers * dir_count, batch_size, num_units]),) - sess.run(variables.global_variables_initializer()) - all_inputs = [entry[0] for entry in inputs_and_shapes] - all_shapes = [entry[1] for entry in inputs_and_shapes] - - err = gradient_checker.compute_gradient_error( - all_inputs, all_shapes, total_sum, 
[1], delta=delta) - - self.assertLess(err, tolerance) - os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state - - @unittest.skipUnless(test.is_built_with_cuda(), - "Test only applicable when running on GPUs") - def DISABLED_testSimpleTraining(self): - # TODO(jamesqin): fix b/117989214 - test_configs = [ - { - "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM, - "dtype": dtypes.float64, - "delta": 1e-4, - "tolerance": 5e-6, - "shape": { - "num_layers": 2, - "num_units": 3, - "input_size": 4, - "batch_size": 3, - "seq_length": 4, - "dir_count": 1, - }, - }, - { - "rnn_mode": cudnn_rnn_ops.CUDNN_GRU, - "dtype": dtypes.float64, - "delta": 1e-4, - "tolerance": 5e-6, - "shape": { - "num_layers": 2, - "num_units": 3, - "input_size": 4, - "batch_size": 3, - "seq_length": 4, - "dir_count": 1, - }, - }, - { - "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH, - "dtype": dtypes.float64, - "delta": 1e-4, - "tolerance": 5e-6, - "shape": { - "num_layers": 2, - "num_units": 3, - "input_size": 4, - "batch_size": 3, - "seq_length": 4, - "dir_count": 1, - }, - }, - { - "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU, - "dtype": dtypes.float64, - "delta": 1e-4, - "tolerance": 5e-6, - "shape": { - "num_layers": 2, - "num_units": 3, - "input_size": 4, - "batch_size": 3, - "seq_length": 4, - "dir_count": 1, - }, - }, - { - "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM, - "dtype": dtypes.float32, - "tolerance": 1.5e-2, - "shape": { - "num_layers": 2, - "num_units": 3, - "input_size": 4, - "batch_size": 3, - "seq_length": 4, - }, - }, - { - "rnn_mode": cudnn_rnn_ops.CUDNN_GRU, - "dtype": dtypes.float32, - "tolerance": 4e-3, - "shape": { - "num_layers": 2, - "num_units": 3, - "input_size": 4, - "batch_size": 3, - "seq_length": 4, - }, - }, - { - "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH, - "dtype": dtypes.float32, - "tolerance": 5e-3, - "shape": { - "num_layers": 2, - "num_units": 3, - "input_size": 4, - "batch_size": 3, - "seq_length": 4, - }, - }, - { - "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU, - "dtype": dtypes.float32, - "tolerance": 5e-1, - "shape": { - "num_layers": 2, - "num_units": 3, - "input_size": 4, - "batch_size": 3, - "seq_length": 4, - }, - }, - ] - dropouts = [0., 0.5, 1.] 
- dir_counts = [1] - for config, dropout, dir_count in itertools.product(test_configs, dropouts, - dir_counts): - rnn_mode = config["rnn_mode"] - dtype = config.get("dtype", dtypes.float32) - delta = config.get("delta", 1e-3) - tolerance = config["tolerance"] - shape = config["shape"] - with ops.Graph().as_default(): - self._testOneSimpleTraining(rnn_mode, shape["num_layers"], - shape["num_units"], shape["input_size"], - shape["batch_size"], shape["seq_length"], - dir_count, dropout, dtype, delta, tolerance) + weights_r, biases_r = sess.run([weights_ops[i], biases_ops[i]]) + self._compare_weights(weights, weights_r) + self._compare_biases(biases, biases_r) if __name__ == "__main__": diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py index 1954f6717bb..7e1b4062ce4 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py @@ -536,7 +536,9 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase): save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test") saver = saver_lib.Saver() - weights, biases = model.rnn.saveable._OpaqueParamsToCanonical() + weights, biases = ( + model.rnn.saveable.format_converter._opaque_to_cu_canonical( + model.rnn.saveable._variables)) opaque_params = rnn.trainable_variables[0] # CudnnTestModel() creates CudnnOpaqueParamsSaveable that helps saver save # Cudnn vars in canonical format. @@ -583,8 +585,12 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase): dtype=dtype) opaque_params = (model1.rnn.trainable_variables[0], model2.rnn.trainable_variables[0]) - weights1, biases1 = model1.rnn.saveable._OpaqueParamsToCanonical() - weights2, biases2 = model2.rnn.saveable._OpaqueParamsToCanonical() + saveable1 = model1.rnn.saveable + weights1, biases1 = saveable1.format_converter._opaque_to_cu_canonical( + saveable1._variables) + saveable2 = model2.rnn.saveable + weights2, biases2 = saveable2.format_converter._opaque_to_cu_canonical( + saveable2._variables) reset_params = [ state_ops.assign(params, array_ops.zeros_like(params, dtype=dtype)) @@ -1039,8 +1045,8 @@ class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase): # Min param size estimate = sum(weights.size) + sum(biases.size) min_params_size = ( - np.sum(list(map(np.prod, rnn.canonical_weight_shapes))) + - np.sum([sp[0] for sp in rnn.canonical_bias_shapes])) + sum(map(np.prod, rnn.canonical_weight_shapes)) + + sum(sp[0] for sp in rnn.canonical_bias_shapes)) opaque_params = rnn.trainable_variables[0] with self.test_session(use_gpu=True, graph=ops.get_default_graph()): diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py index 8bbcc7cd039..8e25637ed91 100644 --- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py +++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py @@ -21,6 +21,7 @@ from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras.engine import input_spec from tensorflow.python.layers import base as base_layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops @@ -322,7 +323,7 @@ class _CudnnRNN(base_layer.Layer): raise ValueError("The last dimension of the inputs to `CudnnRNN` " "should
be defined. Found `None`.") self._input_size = input_shape[-1].value - self.input_spec = base_layer.InputSpec(ndim=3, axes={-1: self._input_size}) + self.input_spec = input_spec.InputSpec(ndim=3, axes={-1: self._input_size}) self._set_scope(None) diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index d06d0c6bdaa..1ce29b42d52 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -738,7 +738,7 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): self._variables, opaque_params, validate_shape=False) def _checkpointable_save(self, save_buffer): - weights, biases = self.format_converter.opaque_params_to_tf_canonical( + weights, biases = self.format_converter.opaque_to_tf_canonical( self._variables) for name, tensor in zip(self._param_names, weights + biases): save_buffer[name] = array_ops.identity(tensor) diff --git a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py index 0456463a192..6c5f8c6b009 100644 --- a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py @@ -46,7 +46,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase): result = dataset.apply(batching.assert_element_shape(expected_shapes)) self.assertEqual(expected_shapes, result.output_shapes) - iterator = result.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(result) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: @@ -88,7 +88,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase): result = dataset.apply(batching.assert_element_shape(expected_shapes)) self.assertEqual(expected_shapes, result.output_shapes) - iterator = result.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(result) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: @@ -115,9 +115,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase): wrong_shapes = (tensor_shape.TensorShape(2), tensor_shape.TensorShape((3, 10))) - iterator = ( - dataset.apply(batching.assert_element_shape(wrong_shapes)) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset.apply(batching.assert_element_shape(wrong_shapes))) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: @@ -142,7 +141,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase): tensor_shape.TensorShape((3, 4))) self.assertEqual(actual_shapes, result.output_shapes) - iterator = result.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(result) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: @@ -184,7 +183,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase): result = dataset.apply(batching.assert_element_shape(expected_shapes)) self.assertEqual(expected_shapes, result.output_shapes) - iterator = result.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(result) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: @@ -211,9 +210,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase): wrong_shapes = 
(tensor_shape.TensorShape(2), tensor_shape.TensorShape((None, 10))) - iterator = ( - dataset.apply(batching.assert_element_shape(wrong_shapes)) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset.apply(batching.assert_element_shape(wrong_shapes))) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py index d2a72272db1..b9840b1ff1a 100644 --- a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py @@ -23,6 +23,7 @@ import shutil from tensorflow.contrib.data.python.ops import readers from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -48,7 +49,7 @@ class LMDBDatasetTest(test_base.DatasetTestBase): num_repeats = 2 dataset = readers.LMDBDataset(filenames).repeat(num_repeats) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py index c5a78623225..2527706709f 100644 --- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py @@ -63,13 +63,13 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): # The pipeline is TensorSliceDataset -> MapDataset(square_3) -> # RepeatDataset(count) -> # _SlideDataset(window_size, window_shift, window_stride). - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn) .repeat(count).apply( sliding.sliding_window_batch( window_size=window_size_t, window_shift=window_shift_t, - window_stride=window_stride_t)).make_initializable_iterator()) + window_stride=window_stride_t))) init_op = iterator.initializer get_next = iterator.get_next() @@ -127,13 +127,13 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): # The pipeline is TensorSliceDataset -> MapDataset(square_3) -> # RepeatDataset(count) -> _SlideDataset(window_size, stride, window_stride). 
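The hunks in these dataset tests all make the same mechanical change: the `Dataset.make_initializable_iterator()` method is replaced by the module-level `dataset_ops.make_initializable_iterator()` helper, which dispatches correctly for both v1 and v2 `Dataset` objects. A minimal sketch of the new pattern using the public API (assuming a TF 1.x build where `tf.compat.v1.data.make_initializable_iterator` is available; the toy pipeline is illustrative only):

```python
import tensorflow as tf

# A small pipeline standing in for the datasets built in the tests above.
dataset = tf.data.Dataset.range(5).map(lambda x: x * 2)

# Old style (method on the dataset object):
#   iterator = dataset.make_initializable_iterator()
# New style (free function that dispatches on the dataset's type):
iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
next_element = iterator.get_next()

with tf.compat.v1.Session() as sess:
  sess.run(iterator.initializer)
  print([sess.run(next_element) for _ in range(5)])  # [0, 2, 4, 6, 8]
```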
- iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn) .repeat(count).apply( sliding.sliding_window_batch( window_size=window_size_t, stride=stride_t, - window_stride=window_stride_t)).make_initializable_iterator()) + window_stride=window_stride_t))) init_op = iterator.initializer get_next = iterator.get_next() @@ -173,12 +173,12 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): window_shift_t = array_ops.placeholder(dtypes.int64, shape=[]) window_stride_t = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.range(10).map(lambda x: x).repeat(count_t).apply( sliding.sliding_window_batch( window_size=window_size_t, window_shift=window_shift_t, - window_stride=window_stride_t)).make_initializable_iterator()) + window_stride=window_stride_t))) init_op = iterator.initializer with self.cached_session() as sess: @@ -204,9 +204,9 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): return sparse_tensor.SparseTensorValue( indices=[[0]], values=(i * [1]), dense_shape=[1]) - iterator = dataset_ops.Dataset.range(10).map(_sparse).apply( - sliding.sliding_window_batch( - window_size=5, window_shift=3)).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10).map(_sparse).apply( + sliding.sliding_window_batch(window_size=5, window_shift=3))) init_op = iterator.initializer get_next = iterator.get_next() @@ -233,9 +233,9 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): values=array_ops.fill([math_ops.to_int32(i)], i), dense_shape=[i]) - iterator = dataset_ops.Dataset.range(10).map(_sparse).apply( - sliding.sliding_window_batch( - window_size=5, window_shift=3)).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10).map(_sparse).apply( + sliding.sliding_window_batch(window_size=5, window_shift=3))) init_op = iterator.initializer get_next = iterator.get_next() @@ -265,11 +265,10 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): return sparse_tensor.SparseTensorValue( indices=[[0]], values=(i * [1]), dense_shape=[1]) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.range(10).map(_sparse).apply( sliding.sliding_window_batch(window_size=4, window_shift=2)).apply( - sliding.sliding_window_batch(window_size=3, window_shift=1)) - .make_initializable_iterator()) + sliding.sliding_window_batch(window_size=3, window_shift=1))) init_op = iterator.initializer get_next = iterator.get_next() @@ -305,11 +304,10 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): yield [4.0, 5.0, 6.0] yield [7.0, 8.0, 9.0, 10.0] - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_generator( generator, dtypes.float32, output_shapes=[None]).apply( - sliding.sliding_window_batch(window_size=3, window_shift=1)) - .make_initializable_iterator()) + sliding.sliding_window_batch(window_size=3, window_shift=1))) next_element = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD index 34dc2379d0c..0fb406f1167 100644 --- a/tensorflow/contrib/data/python/ops/BUILD +++ b/tensorflow/contrib/data/python/ops/BUILD @@ -188,8 +188,7 @@ py_library( 
"//tensorflow/python:framework_ops", "//tensorflow/python:function", "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/data/util:nest", - "//tensorflow/python/data/util:sparse", + "//tensorflow/python/data/util:structure", ], ) diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py index 4601376dff4..aa42782807a 100644 --- a/tensorflow/contrib/data/python/ops/readers.py +++ b/tensorflow/contrib/data/python/ops/readers.py @@ -355,7 +355,7 @@ def read_batch_features(file_pattern, shuffle=randomize_input, num_epochs=num_epochs, shuffle_buffer_size=capacity) - iterator = dataset.make_one_shot_iterator() + iterator = dataset_ops.make_one_shot_iterator(dataset) outputs = iterator.get_next() return outputs @@ -379,15 +379,13 @@ class LMDBDataset(dataset_ops.DatasetSource): (key value) pairs sequentially. For example: ```python + tf.enable_eager_execution() + dataset = tf.contrib.lmdb.LMDBDataset("/foo/bar.mdb") - iterator = dataset.make_one_shot_iterator() - next_element = iterator.get_next() + # Prints the (key, value) pairs inside a lmdb file. - while True: - try: - print(sess.run(next_element)) - except tf.errors.OutOfRangeError: - break + for key, value in dataset: + print(key, value) ``` Args: filenames: A `tf.string` tensor containing one or more filenames. diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py index bcc383587c5..9ebdca317f2 100644 --- a/tensorflow/contrib/data/python/ops/sliding.py +++ b/tensorflow/contrib/data/python/ops/sliding.py @@ -18,11 +18,10 @@ from __future__ import division from __future__ import print_function from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.data.util import nest +from tensorflow.python.data.util import structure from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape -from tensorflow.python.ops import gen_dataset_ops +from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops from tensorflow.python.util import deprecation @@ -40,29 +39,31 @@ class _SlideDataset(dataset_ops.UnaryDataset): self._window_shift = ops.convert_to_tensor( window_shift, dtype=dtypes.int64, name="window_shift") + # pylint: disable=protected-access + input_structure = structure.Structure._from_legacy_structure( + input_dataset.output_types, input_dataset.output_shapes, + input_dataset.output_classes) + self._output_structure = input_structure._batch(None) + def _as_variant_tensor(self): - return gen_dataset_ops.slide_dataset( + return ged_ops.experimental_sliding_window_dataset( self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access window_size=self._window_size, window_shift=self._window_shift, window_stride=self._window_stride, - **dataset_ops.flat_structure(self)) + **dataset_ops.flat_structure(structure=self._output_structure)) @property def output_classes(self): - return self._input_dataset.output_classes + return self._output_structure._to_legacy_output_classes() # pylint: disable=protected-access @property def output_shapes(self): - input_shapes = self._input_dataset.output_shapes - return nest.pack_sequence_as(input_shapes, [ - tensor_shape.vector(None).concatenate(s) - for s in nest.flatten(self._input_dataset.output_shapes) - ]) + return self._output_structure._to_legacy_output_shapes() # pylint: disable=protected-access @property def output_types(self): - return 
self._input_dataset.output_types + return self._output_structure._to_legacy_output_types() # pylint: disable=protected-access @deprecation.deprecated_args( diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD index a87a5624c88..3ecd755d86f 100644 --- a/tensorflow/contrib/distribute/BUILD +++ b/tensorflow/contrib/distribute/BUILD @@ -26,7 +26,6 @@ py_library( visibility = ["//tensorflow:internal"], deps = [ "//tensorflow/contrib/distribute/python:collective_all_reduce_strategy", - "//tensorflow/contrib/distribute/python:cross_tower_ops", "//tensorflow/contrib/distribute/python:mirrored_strategy", "//tensorflow/contrib/distribute/python:monitor", "//tensorflow/contrib/distribute/python:one_device_strategy", @@ -35,6 +34,7 @@ py_library( "//tensorflow/contrib/distribute/python:tpu_strategy", "//tensorflow/python:training", "//tensorflow/python:util", + "//tensorflow/python/distribute:cross_device_ops", "//tensorflow/python/distribute:distribute_config", "//tensorflow/python/distribute:distribute_coordinator", ], diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md index a938f8629d8..81574a2047e 100644 --- a/tensorflow/contrib/distribute/README.md +++ b/tensorflow/contrib/distribute/README.md @@ -134,7 +134,7 @@ def model_fn(features, labels, mode): return tf.estimator.EstimatorSpec(mode, loss=loss) if mode == tf.estimator.ModeKeys.TRAIN: - train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss_fn()) + train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss) return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op) ``` @@ -251,10 +251,10 @@ start multi-worker training using `tf.estimator.train_and_evaluate`: ```python def model_main(): - estimator = ... distribution = tf.contrib.distribute.CollectiveAllReduceStrategy( num_gpus_per_worker=2) config = tf.estimator.RunConfig(train_distribute=distribution) + estimator = tf.estimator.Estimator(model_fn=model_fn, config=config) train_spec = tf.estimator.TrainSpec(input_fn=input_fn) eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) @@ -327,13 +327,13 @@ start training. On your laptop, you can run ```python -estimator = ... 
distribution = tf.contrib.distribute.CollectiveAllReduceStrategy( num_gpus_per_worker=2) config = tf.estimator.RunConfig( experimental_distribute=tf.contrib.distribute.DistributeConfig( train_distribute=distribution, remote_cluster={"worker": ["host1:port", "host2:port", "host3:port"]})) +estimator = tf.estimator.Estimator(model_fn=model_fn, config=config) train_spec = tf.estimator.TrainSpec(input_fn=input_fn) eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py index ab2f221dc64..8ec73654e30 100644 --- a/tensorflow/contrib/distribute/__init__.py +++ b/tensorflow/contrib/distribute/__init__.py @@ -25,13 +25,13 @@ from __future__ import print_function # pylint: disable=unused-import,wildcard-import from tensorflow.contrib.distribute.python.collective_all_reduce_strategy import CollectiveAllReduceStrategy -from tensorflow.contrib.distribute.python.cross_tower_ops import * from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy from tensorflow.contrib.distribute.python.monitor import Monitor from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy from tensorflow.contrib.distribute.python.step_fn import * from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy +from tensorflow.python.distribute.cross_device_ops import * from tensorflow.python.distribute.distribute_config import DistributeConfig from tensorflow.python.distribute.distribute_coordinator import run_standard_tensorflow_server from tensorflow.python.training.distribute import * @@ -46,6 +46,7 @@ _allowed_symbols = [ 'CrossDeviceOps', 'DistributeConfig', 'DistributionStrategy', + 'DistributionStrategyExtended', 'MirroredStrategy', 'Monitor', 'MultiWorkerAllReduce', @@ -62,6 +63,7 @@ _allowed_symbols = [ 'get_loss_reduction', 'get_replica_context', 'has_distribution_strategy', + 'in_cross_replica_context', 'require_replica_context', 'run_standard_tensorflow_server', 'UpdateContext', diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index 4094e52169a..4c9c35da5a3 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -16,45 +16,26 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test") # TODO(priyag): Figure out testonly issues that are preventing us from # including our tests in pip for now. 
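Both README snippets above apply the same fix: the `RunConfig` that carries the distribution strategy must be created first, and the `Estimator` constructed from it; otherwise the strategy never reaches the estimator. A hedged, single-machine sketch of that ordering (the `model_fn`/`input_fn` bodies and the CPU-only `MirroredStrategy` device list are illustrative placeholders, not part of this change):

```python
import tensorflow as tf

def model_fn(features, labels, mode):
  logits = tf.layers.dense(features, 1)
  loss = tf.losses.mean_squared_error(labels=labels, predictions=logits)
  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = tf.train.GradientDescentOptimizer(0.2).minimize(
        loss, global_step=tf.train.get_or_create_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
  return tf.estimator.EstimatorSpec(mode, loss=loss)

def input_fn():
  features = tf.data.Dataset.from_tensors([[1.0]]).repeat(100)
  labels = tf.data.Dataset.from_tensors([[2.0]]).repeat(100)
  return tf.data.Dataset.zip((features, labels)).batch(10)

# Build the strategy-carrying RunConfig first ...
distribution = tf.contrib.distribute.MirroredStrategy(["/cpu:0"])
config = tf.estimator.RunConfig(train_distribute=distribution)
# ... and only then the Estimator, so it actually picks up the strategy.
estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
estimator.train(input_fn=input_fn, steps=10)
```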
-py_library( - name = "values", - srcs = ["values.py"], - visibility = ["//tensorflow:internal"], - deps = [ - ":input_ops", - "//tensorflow/python:array_ops", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:device_util", - "//tensorflow/python:distribute", - "//tensorflow/python:framework_ops", - "//tensorflow/python:resource_variable_ops", - "//tensorflow/python:training", - "//tensorflow/python:util", - "//tensorflow/python/data/ops:multi_device_iterator_ops", - "//tensorflow/python/eager:context", - "//tensorflow/python/training/checkpointable:base", - "@six_archive//:six", - ], -) - cuda_py_test( name = "values_test", srcs = ["values_test.py"], additional_deps = [ + ":combinations", ":mirrored_strategy", ":multi_worker_test_base", - ":values", + "@absl_py//absl/testing:parameterized", "//tensorflow/core:protos_all_py", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python:errors", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", + "//tensorflow/python:errors", "//tensorflow/python:framework_ops", "//tensorflow/python:framework_test_lib", "//tensorflow/python:training", "//tensorflow/python:variable_scope", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/distribute:device_util", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", - "//tensorflow/python:device_util", "//tensorflow/python/eager:test", "//tensorflow/python/estimator:estimator_py", ], @@ -68,25 +49,9 @@ py_library( srcs = ["mirrored_strategy.py"], visibility = ["//tensorflow:internal"], deps = [ - ":cross_tower_ops", - ":shared_variable_creator", - ":values", - "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:constant_op", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:device", - "//tensorflow/python:device_util", - "//tensorflow/python:distribute", - "//tensorflow/python:framework_ops", - "//tensorflow/python:pywrap_tensorflow", - "//tensorflow/python:training", - "//tensorflow/python:util", - "//tensorflow/python:variable_scope", - "//tensorflow/python:variables", - "//tensorflow/python/distribute:multi_worker_util", - "//tensorflow/python/eager:context", - "//tensorflow/python/eager:tape", + "//tensorflow/python/distribute:distribute_lib", + "//tensorflow/python/distribute:mirrored_strategy", + "//tensorflow/python/distribute:values", ], ) @@ -95,16 +60,17 @@ py_library( srcs = ["parameter_server_strategy.py"], visibility = ["//tensorflow:internal"], deps = [ - ":cross_tower_ops", ":mirrored_strategy", - ":values", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:framework_ops", "//tensorflow/python:resource_variable_ops", "//tensorflow/python:training", "//tensorflow/python:util", + "//tensorflow/python/distribute:cross_device_ops", "//tensorflow/python/distribute:multi_worker_util", + "//tensorflow/python/distribute:reduce_util", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", ], ) @@ -116,7 +82,7 @@ cuda_py_test( ":combinations", ":multi_worker_test_base", ":parameter_server_strategy", - ":values", + ":strategy_test_lib", "@absl_py//absl/testing:parameterized", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", @@ -127,10 +93,12 @@ cuda_py_test( "//tensorflow/python:gradients", "//tensorflow/python:layers", "//tensorflow/python:session", + "//tensorflow/python:tensor_util", "//tensorflow/python:training", "//tensorflow/python:variable_scope", 
"//tensorflow/python:variables", "//tensorflow/python/distribute:multi_worker_util", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", "//tensorflow/python/estimator:estimator_py", ], @@ -145,12 +113,13 @@ py_library( srcs = ["one_device_strategy.py"], visibility = ["//tensorflow:internal"], deps = [ - ":values", - "//tensorflow/contrib/eager/python:datasets", "//tensorflow/python:array_ops", - "//tensorflow/python:distribute", + "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", + "//tensorflow/python/distribute:distribute_lib", + "//tensorflow/python/distribute:reduce_util", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", "@six_archive//:six", ], @@ -161,16 +130,16 @@ py_library( srcs = ["collective_all_reduce_strategy.py"], visibility = ["//tensorflow:internal"], deps = [ - ":cross_tower_ops", - ":cross_tower_utils", ":mirrored_strategy", - ":values", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:collective_ops", "//tensorflow/python:framework_ops", "//tensorflow/python:training", + "//tensorflow/python/distribute:cross_device_ops", + "//tensorflow/python/distribute:cross_device_utils", "//tensorflow/python/distribute:multi_worker_util", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", ], ) @@ -187,11 +156,11 @@ py_library( "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", - "//tensorflow/python:distribute", "//tensorflow/python:framework_ops", "//tensorflow/python:layers", "//tensorflow/python:training", "//tensorflow/python:variables", + "//tensorflow/python/distribute:distribute_lib", "//tensorflow/python/eager:backprop", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", @@ -212,10 +181,10 @@ py_library( ":tpu_strategy", "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip", "//tensorflow/contrib/optimizer_v2:training", - "//tensorflow/python:distribute", "//tensorflow/python:framework_ops", "//tensorflow/python:training", "//tensorflow/python:util", + "//tensorflow/python/distribute:distribute_lib", "//tensorflow/python/eager:context", "@absl_py//absl/testing:parameterized", ], @@ -233,28 +202,6 @@ py_test( ], ) -py_test( - name = "mirrored_strategy_test", - srcs = ["mirrored_strategy_test.py"], - srcs_version = "PY2AND3", - tags = [ - "no_pip", - ], - deps = [ - ":mirrored_strategy", - ":multi_worker_test_base", - ":strategy_test_lib", - "//tensorflow/python:constant_op", - "//tensorflow/python:distribute", - "//tensorflow/python:framework_ops", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:training", - "//tensorflow/python:variable_scope", - "//tensorflow/python/eager:context", - "//tensorflow/python/eager:test", - ], -) - py_test( name = "one_device_strategy_test", srcs = ["one_device_strategy_test.py"], @@ -270,35 +217,32 @@ py_test( ], ) +# TODO(priyag): Rename this test to mirrored_strategy_test cuda_py_test( name = "mirrored_strategy_multigpu_test", srcs = ["mirrored_strategy_multigpu_test.py"], additional_deps = [ + ":combinations", ":mirrored_strategy", ":multi_worker_test_base", - ":values", ":strategy_test_lib", - "//tensorflow/python:distribute", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", + "//tensorflow/python:framework_test_lib", "//tensorflow/python:layers", "//tensorflow/python:state_ops", "//tensorflow/python:variable_scope", 
- "//tensorflow/python:framework_test_lib", + "//tensorflow/python/distribute:distribute_lib", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], + shard_count = 5, tags = [ "guitar", - "no_pip", "multi_and_single_gpu", - # Do not perform the extra analysis on this test, because it is already - # performed for the `:mirrored_strategy_test` target. - "no_oss", - "noasan", - "notap", - "notsan", + "no_pip", ], ) @@ -337,12 +281,15 @@ py_library( visibility = ["//tensorflow:internal"], deps = [ ":one_device_strategy", - ":values", "//tensorflow/contrib/tpu:tpu_lib", "//tensorflow/python:constant_op", "//tensorflow/python:control_flow_ops", + "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", + "//tensorflow/python:tensor_util", "//tensorflow/python:util", + "//tensorflow/python/distribute:reduce_util", + "//tensorflow/python/distribute:values", ], ) @@ -352,7 +299,6 @@ cuda_py_test( additional_deps = [ ":collective_all_reduce_strategy", ":combinations", - ":cross_tower_utils", ":multi_worker_test_base", ":strategy_test_lib", "@absl_py//absl/testing:parameterized", @@ -368,6 +314,7 @@ cuda_py_test( "//tensorflow/python:layers", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", + "//tensorflow/python/distribute:cross_device_utils", "//tensorflow/python/eager:context", "//tensorflow/python/estimator:estimator_py", ], @@ -469,6 +416,7 @@ cuda_py_test( "multi_and_single_gpu", "no_oss", # http://b/119349471 "no_pip", + "tf_integration_test", ], ) @@ -476,28 +424,18 @@ cuda_py_test( name = "keras_optimizer_v2_test", srcs = ["keras_optimizer_v2_test.py"], additional_deps = [ - ":combinations", - "@absl_py//absl/testing:parameterized", - "//third_party/py/numpy", - "//tensorflow/contrib/optimizer_v2:training", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/eager:test", - "//tensorflow/python/estimator:estimator_py", - "//tensorflow/python/feature_column", - "//tensorflow/python:framework_ops", - "//tensorflow/python:platform", - "//tensorflow/python:summary", + ":keras_test_lib", ], tags = [ "multi_and_single_gpu", "no_oss", # http://b/119349471 "no_pip", + "tf_integration_test", ], ) cuda_py_test( name = "estimator_training_test", - size = "large", srcs = ["estimator_training_test.py"], additional_deps = [ ":collective_all_reduce_strategy", @@ -508,7 +446,9 @@ cuda_py_test( "//third_party/py/numpy", "//tensorflow/contrib/optimizer_v2:training", "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/distribute", + "//tensorflow/python/distribute:distribute_config", + "//tensorflow/python/distribute:distribute_coordinator", + "//tensorflow/python/distribute:distribute_coordinator_context", "//tensorflow/python/eager:test", "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/feature_column", @@ -516,7 +456,7 @@ cuda_py_test( "//tensorflow/python:platform", "//tensorflow/python:summary", ], - shard_count = 5, + shard_count = 48, tags = [ "multi_and_single_gpu", "no_pip", @@ -524,6 +464,7 @@ cuda_py_test( "noasan", "nomsan", "notsan", + "no_oss", # http://b/119349471 ], ) @@ -599,52 +540,16 @@ cuda_py_test( ], ) -py_library( - name = "shared_variable_creator", - srcs = ["shared_variable_creator.py"], - visibility = ["//tensorflow:internal"], -) - -py_test( - name = "shared_variable_creator_test", - srcs = ["shared_variable_creator_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":shared_variable_creator", - "//tensorflow/python:framework_test_lib", - 
"//tensorflow/python:variable_scope", - "//tensorflow/python/eager:test", - ], -) - -py_library( - name = "cross_tower_utils", - srcs = ["cross_tower_utils.py"], - srcs_version = "PY2AND3", - deps = [ - ":values", - "//tensorflow/contrib/all_reduce:all_reduce_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:collective_ops", - "//tensorflow/python:device", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:gradients", - "//tensorflow/python:math_ops", - "//tensorflow/python:nccl_ops", - ], -) - cuda_py_test( - name = "cross_tower_utils_test", - srcs = ["cross_tower_utils_test.py"], + name = "cross_device_utils_test", + srcs = ["cross_device_utils_test.py"], additional_deps = [ ":combinations", - ":cross_tower_utils", "@absl_py//absl/testing:parameterized", "//tensorflow/python:constant_op", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", + "//tensorflow/python/distribute:cross_device_utils", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], @@ -653,40 +558,20 @@ cuda_py_test( ], ) -py_library( - name = "cross_tower_ops", - srcs = ["cross_tower_ops.py"], - srcs_version = "PY2AND3", - deps = [ - ":cross_tower_utils", - ":values", - "//tensorflow/python:array_ops", - "//tensorflow/python:device_lib", - "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform", - "//tensorflow/python:resource_variable_ops", - "//tensorflow/python:training", - "//tensorflow/python:variable_scope", - "//tensorflow/python/eager:context", - "@six_archive//:six", - ], -) - cuda_py_test( - name = "cross_tower_ops_test", - srcs = ["cross_tower_ops_test.py"], + name = "cross_device_ops_test", + srcs = ["cross_device_ops_test.py"], additional_deps = [ ":combinations", - ":cross_tower_ops", ":multi_worker_test_base", ":mirrored_strategy", - ":values", "@absl_py//absl/testing:parameterized", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", + "//tensorflow/python/distribute:cross_device_ops", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], @@ -696,37 +581,6 @@ cuda_py_test( ], ) -py_library( - name = "input_ops", - srcs = ["input_ops.py"], - visibility = ["//tensorflow:internal"], - deps = [ - "//tensorflow/python:framework_ops", - "//tensorflow/python/data/util:nest", - ], -) - -cuda_py_test( - name = "input_ops_test", - srcs = ["input_ops_test.py"], - additional_deps = [ - ":input_ops", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/contrib/data/python/ops:batching", - "//tensorflow/contrib/data/python/ops:interleave_ops", - "//tensorflow/python:errors", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_ops", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:io_ops", - "//tensorflow/python/data/ops:readers", - "//tensorflow/python:util", - ], - tags = [ - "no_pip", - ], -) - py_library( name = "keras_test_lib", testonly = 1, @@ -737,6 +591,7 @@ py_library( "//tensorflow/contrib/distribute/python:tpu_strategy", "//tensorflow/python:client_testlib", "//tensorflow/python:training", + "//tensorflow/python/eager:test", "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/keras", "//third_party/py/numpy", @@ -766,7 +621,6 @@ py_library( srcs = ["metrics_v1_test.py"], deps = [ ":combinations", - "//tensorflow/contrib/data/python/ops:batching", 
"//tensorflow/python:math_ops", "//tensorflow/python:metrics", "//tensorflow/python:variables", diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py index d38bdb592a3..31bd0e996a2 100644 --- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py +++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py @@ -43,7 +43,9 @@ class CheckpointUtilsWithDistributionStrategyTest( distribution=[combinations.default_strategy, combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus], + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus], in_replica_mode=[True, False], mode=["graph"])) def testInitFromCheckpoint(self, distribution, in_replica_mode): diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py index efa99d1fc52..e988b63a287 100644 --- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py @@ -18,12 +18,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib -from tensorflow.contrib.distribute.python import cross_tower_utils +import copy + from tensorflow.contrib.distribute.python import mirrored_strategy -from tensorflow.contrib.distribute.python import values from tensorflow.core.protobuf import rewriter_config_pb2 +from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib +from tensorflow.python.distribute import cross_device_utils +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import multi_worker_util +from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -32,7 +36,7 @@ from tensorflow.python.platform import tf_logging as logging # TODO(yuefengz): support in-graph replication. -class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): +class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy): """Distribution strategy that uses collective ops for all-reduce. It is similar to the MirroredStrategy but it uses collective ops for @@ -53,6 +57,17 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): num_gpus_per_worker: number of local GPUs or GPUs per worker, the default is 0 meaning CPU only. 
""" + super(CollectiveAllReduceStrategy, self).__init__( + CollectiveAllReduceExtended(self, num_gpus_per_worker)) + + +class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): + """Implementation of CollectiveAllReduceStrategy.""" + + def __init__(self, container_strategy, num_gpus_per_worker): + distribute_lib.DistributionStrategyExtended.__init__( + self, container_strategy) + self._cross_device_ops = None self._num_gpus_per_worker = num_gpus_per_worker self._initialize_local_worker(num_gpus_per_worker) @@ -67,14 +82,14 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): ] else: local_devices = ["/device:CPU:0"] + self._worker_device = device_util.canonicalize("/device:CPU:0") - self._collective_keys = cross_tower_utils.CollectiveKeys() - super(CollectiveAllReduceStrategy, self).__init__( - devices=local_devices, - cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce( - num_workers=1, - num_gpus_per_worker=num_gpus_per_worker, - collective_keys=self._collective_keys)) + self._collective_keys = cross_device_utils.CollectiveKeys() + self._initialize_local(local_devices) + self._cross_tower_ops = cross_device_ops_lib.CollectiveAllReduce( + num_workers=self._num_workers, + num_gpus_per_worker=num_gpus_per_worker, + collective_keys=self._collective_keys) self._cluster_spec = None self._task_type = None @@ -94,8 +109,7 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): "Unrecognized task_type: %r, valid task types are: \"chief\", " "\"worker\"." % task_type) cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec) - self._num_workers = len(cluster_spec.as_dict().get("worker", [])) + len( - cluster_spec.as_dict().get("chief", [])) + self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type) if not self._num_workers: raise ValueError("No `worker` or `chief` tasks can be found in " "`cluster_spec`.") @@ -103,22 +117,21 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type, task_id) - worker_device = "/job:%s/task:%d" % (task_type, task_id) + self._worker_device = "/job:%s/task:%d" % (task_type, task_id) if num_gpus_per_worker: local_devices = [ - "%s/device:GPU:%d" % (worker_device, i) + "%s/device:GPU:%d" % (self._worker_device, i) for i in range(num_gpus_per_worker) ] else: - local_devices = [worker_device] + local_devices = [self._worker_device] - self._collective_keys = cross_tower_utils.CollectiveKeys() - super(CollectiveAllReduceStrategy, self).__init__( - devices=local_devices, - cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce( - num_workers=self._num_workers, - num_gpus_per_worker=num_gpus_per_worker, - collective_keys=self._collective_keys)) + self._collective_keys = cross_device_utils.CollectiveKeys() + self._initialize_local(local_devices) + self._cross_tower_ops = cross_device_ops_lib.CollectiveAllReduce( + num_workers=self._num_workers, + num_gpus_per_worker=num_gpus_per_worker, + collective_keys=self._collective_keys) # Add a default device so that ops without specified devices will not end up # on other workers. @@ -202,17 +215,40 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): return mirrored_strategy._create_mirrored_variable( devices, _real_mirrored_creator, *args, **kwargs) - def distribute_dataset(self, dataset_fn): + def _distribute_dataset(self, dataset_fn): """Distributes the dataset to each local GPU.""" # TODO(yuefengz): shard the dataset. 
return values.PerReplicaDataset( self._call_dataset_fn(dataset_fn), self._devices, True) - def configure(self, - session_config=None, - cluster_spec=None, - task_type=None, - task_id=None): + def _make_dataset_iterator(self, dataset): + worker_device_pairs = [(self._worker_device, self._devices)] + return values.DatasetIterator(dataset, worker_device_pairs, + self._num_replicas_in_sync) + + def _make_input_fn_iterator( + self, + input_fn, + replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): + """Distributes the dataset to each local GPU.""" + if self._cluster_spec is None: + input_pipeline_id = 0 + else: + input_pipeline_id = multi_worker_util.id_in_cluster( + self._cluster_spec, self._task_type, self._task_id) + input_context = distribute_lib.InputContext( + num_input_pipelines=self._num_workers, + input_pipeline_id=input_pipeline_id, + num_replicas_in_sync=self._num_replicas_in_sync) + + return values.InputFunctionIterator( + input_fn, [(self._worker_device, self._devices)], [input_context]) + + def _configure(self, + session_config=None, + cluster_spec=None, + task_type=None, + task_id=None): """Configures the object. Args: @@ -232,13 +268,15 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): self._initialize_multi_worker(self._num_gpus_per_worker, cluster_spec, task_type, task_id) - if not session_config: - return + if session_config: + session_config.CopyFrom(self._update_config_proto(session_config)) + def _update_config_proto(self, config_proto): + updated_config = copy.deepcopy(config_proto) # Enable the scoped allocator optimization for CollectiveOps. This # optimization converts many small all-reduces into fewer larger # all-reduces. - rewrite_options = session_config.graph_options.rewrite_options + rewrite_options = updated_config.graph_options.rewrite_options rewrite_options.scoped_allocator_optimization = ( rewriter_config_pb2.RewriterConfig.ON) # We turn on ScopedAllocator only for CollectiveReduce op, i.e. enable_op = @@ -248,7 +286,7 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce") if not self._cluster_spec: - return + return updated_config assert self._task_type assert self._task_id is not None @@ -256,26 +294,28 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): # Collective group leader is needed for collective ops to coordinate # workers. if "chief" in self._cluster_spec.jobs: - session_config.experimental.collective_group_leader = ( + updated_config.experimental.collective_group_leader = ( "/job:chief/replica:0/task:0") else: if "worker" not in self._cluster_spec.jobs: raise ValueError( "You must have `chief` or `worker` jobs in the `cluster_spec`.") - session_config.experimental.collective_group_leader = ( + updated_config.experimental.collective_group_leader = ( "/job:worker/replica:0/task:0") # The device filters prevent communication between workers. 
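The `InputContext` assembled in `_make_input_fn_iterator` above (pipeline count, pipeline id, replicas in sync) is what a multi-worker `input_fn` is expected to consume for sharding. A runnable sketch of that contract with a tiny stand-in context object (the namedtuple and the toy dataset are illustrative; the real context comes from the strategy):

```python
import collections
import tensorflow as tf

# Stand-in for the InputContext passed to input_fn (illustration only).
FakeInputContext = collections.namedtuple(
    "FakeInputContext", ["num_input_pipelines", "input_pipeline_id"])

def input_fn(ctx):
  # Every worker builds the same dataset and keeps only its own shard.
  return tf.data.Dataset.range(8).shard(ctx.num_input_pipelines,
                                        ctx.input_pipeline_id)

dataset = input_fn(FakeInputContext(num_input_pipelines=2, input_pipeline_id=1))
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_element = iterator.get_next()
with tf.compat.v1.Session() as sess:
  print([sess.run(next_element) for _ in range(4)])  # [1, 3, 5, 7]
```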
- del session_config.device_filters[:] - session_config.device_filters.append( + del updated_config.device_filters[:] + updated_config.device_filters.append( "/job:%s/task:%d" % (self._task_type, self._task_id)) + return updated_config + @property - def between_graph(self): + def experimental_between_graph(self): return True @property - def should_init(self): + def experimental_should_init(self): return True @property @@ -287,6 +327,10 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): return self._is_chief @property - def num_replicas_in_sync(self): + def _num_replicas_in_sync(self): return len(self._devices) * self._num_workers + # TODO(priyag): Delete this once all strategies use global batch size. + @property + def _global_batch_size(self): + return False diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py index e3d919dd0d4..8a9e583f0af 100644 --- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py @@ -23,13 +23,19 @@ import numpy as np from tensorflow.contrib.distribute.python import collective_all_reduce_strategy from tensorflow.contrib.distribute.python import combinations -from tensorflow.contrib.distribute.python import cross_tower_utils from tensorflow.contrib.distribute.python import multi_worker_test_base +from tensorflow.contrib.distribute.python import strategy_test_lib from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python import keras +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import cross_device_utils +from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.layers import core from tensorflow.python.ops import array_ops @@ -51,9 +57,6 @@ class CollectiveAllReduceStrategyTestBase( collective_key_base = 0 def setUp(self): - self._run_options = config_pb2.RunOptions() - self._run_options.experimental.collective_graph_key = 6 - # We use a different key_base for each test so that collective keys won't be # reused. 
# TODO(yuefengz, tucker): enable it to reuse collective keys in different @@ -71,15 +74,16 @@ class CollectiveAllReduceStrategyTestBase( cluster_spec=self._cluster_spec, task_type=task_type, task_id=task_id) - collective_keys = cross_tower_utils.CollectiveKeys( + collective_keys = cross_device_utils.CollectiveKeys( group_key_start=10 * num_gpus + CollectiveAllReduceStrategyTestBase.collective_key_base, instance_key_start=num_gpus * 100 + CollectiveAllReduceStrategyTestBase.collective_key_base, instance_key_with_id_start=num_gpus * 10000 + CollectiveAllReduceStrategyTestBase.collective_key_base) - distribution._collective_keys = collective_keys - distribution._cross_tower_ops._collective_keys = collective_keys + distribution.extended._collective_keys = collective_keys + distribution.extended._inferred_cross_device_ops._collective_keys = ( + collective_keys) if task_type and task_id is not None: return distribution, 'grpc://' + self._cluster_spec[task_type][ task_id], session_config @@ -93,7 +97,8 @@ class CollectiveAllReduceStrategyTestBase( self.cached_session(config=config, target=master_target) as sess, \ d.scope(): - l = core.Dense(1, use_bias=False, name='gpu_%d' % d._num_gpus_per_worker) + l = core.Dense(1, use_bias=False, + name='gpu_%d' % d.extended._num_gpus_per_worker) def loss_fn(x): y = array_ops.reshape(l(x), []) - constant_op.constant(1.) @@ -127,8 +132,8 @@ class CollectiveAllReduceStrategyTestBase( before_list.append(fetched) with ops.control_dependencies([fetched]): # TODO(yuefengz): support non-Mirrored variable as destinations. - g = d.reduce( - variable_scope.VariableAggregation.SUM, g, destinations=v) + g = d.extended.reduce_to( + reduce_util.ReduceOp.SUM, g, destinations=v) with ops.control_dependencies( d.update(v, update, g, grouped=False)): after_list.append(d.read_var(v)) @@ -136,14 +141,13 @@ class CollectiveAllReduceStrategyTestBase( before_out, after_out = step() - if context.num_gpus() < d._num_gpus_per_worker: + if context.num_gpus() < d.extended._num_gpus_per_worker: return True - sess.run( - variables.global_variables_initializer(), options=self._run_options) + sess.run(variables.global_variables_initializer()) for i in range(10): - b, a = sess.run((before_out, after_out), options=self._run_options) + b, a = sess.run((before_out, after_out)) if i == 0: before, = b after, = a @@ -222,26 +226,54 @@ class CollectiveAllReduceStrategyTestBase( return array_ops.identity(x) x = distribution.call_for_each_replica(model_fn) - reduced_x = distribution.unwrap( - distribution.reduce( - variable_scope.VariableAggregation.MEAN, x, - destinations='/cpu:0'))[0] + reduced_x = distribution.reduce(reduce_util.ReduceOp.MEAN, x) x = distribution.unwrap(x)[0] - sess.run( - variables.global_variables_initializer(), options=self._run_options) + sess.run(variables.global_variables_initializer()) - x_value, reduced_x_value = sess.run([x, reduced_x], - options=self._run_options) + x_value, reduced_x_value = sess.run([x, reduced_x]) self.assertTrue( np.allclose(x_value, reduced_x_value, atol=1e-5), msg=('x_value = %r, reduced_x_value = %r' % (x_value, reduced_x_value))) return np.allclose(x_value, reduced_x_value, atol=1e-5) + def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn, + expected_values): + distribution, master_target, config = self._get_test_object( + task_type, task_id, num_gpus) + devices = distribution.extended.worker_devices + + with ops.Graph().as_default(), \ + self.cached_session(config=config, + target=master_target) as sess: + iterator = 
distribution.make_input_fn_iterator(input_fn) + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + sess.run([values.select_device(d, next_element) for d in devices]) + + # After re-initializing the iterator, should be able to iterate again. + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + class DistributedCollectiveAllReduceStrategyTest( - CollectiveAllReduceStrategyTestBase, parameterized.TestCase): + CollectiveAllReduceStrategyTestBase, + strategy_test_lib.DistributionTestBase, + parameterized.TestCase): @classmethod def setUpClass(cls): @@ -269,7 +301,7 @@ class DistributedCollectiveAllReduceStrategyTest( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) def testVariableInitialization(self, num_gpus): if context.num_gpus() < num_gpus: - return + self.skipTest('Not enough GPUs') self._run_between_graph_clients( self._test_variable_initialization, self._cluster_spec, @@ -279,10 +311,56 @@ class DistributedCollectiveAllReduceStrategyTest( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) def testComplexModel(self, num_gpus): if context.num_gpus() < num_gpus: - return + self.skipTest('Not enough GPUs') self._run_between_graph_clients( self._test_complex_model, self._cluster_spec, num_gpus=num_gpus) + # TODO(yuefengz): Update how we use num_gpus and required_gpus + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testMakeInputFnIterator(self, num_gpus): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + dataset_fn = lambda: dataset_ops.Dataset.range(100) + # We use CPU as the device when num_gpus = 0 + devices_per_worker = max(1, num_gpus) + expected_values = [[i+j for j in range(devices_per_worker)] + for i in range(0, 100, devices_per_worker)] + + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=3*devices_per_worker, + expected_num_input_pipelines=3, + expected_input_pipeline_id=1) # because task_id = 1 + self._test_input_fn_iterator('worker', 1, num_gpus, + input_fn, expected_values) + + def testUpdateConfigProto(self): + distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy( + num_gpus_per_worker=2) + distribution.configure( + cluster_spec=self._cluster_spec, task_type='worker', task_id=1) + + config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden']) + rewrite_options = config_proto.graph_options.rewrite_options + rewrite_options.scoped_allocator_opts.enable_op.append('to_be_removed') + + new_config = distribution.update_config_proto(config_proto) + + # Verify group leader + self.assertEqual('/job:worker/replica:0/task:0', + new_config.experimental.collective_group_leader) + + # Verify device filters. + self.assertEqual(['/job:worker/task:1'], new_config.device_filters) + + # Verify rewrite options. 
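`testUpdateConfigProto` above exercises the new `update_config_proto` path, which returns an amended copy of a `ConfigProto` (group leader, device filters, scoped-allocator options) instead of mutating the session config in place. A hedged sketch of how a between-graph worker might use it when opening its session (the two-host cluster spec is an assumption for illustration):

```python
import tensorflow as tf

strategy = tf.contrib.distribute.CollectiveAllReduceStrategy(
    num_gpus_per_worker=0)
strategy.configure(
    cluster_spec={"worker": ["host1:2222", "host2:2222"]},
    task_type="worker",
    task_id=0)

# Start from whatever session config the caller already has ...
base_config = tf.ConfigProto(allow_soft_placement=True)
# ... and let the strategy fill in collective-ops details on a copy.
session_config = strategy.update_config_proto(base_config)
print(session_config.experimental.collective_group_leader)
# /job:worker/replica:0/task:0
print(list(session_config.device_filters))  # ['/job:worker/task:0']
```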
+ new_rewrite_options = new_config.graph_options.rewrite_options + self.assertEqual(rewriter_config_pb2.RewriterConfig.ON, + new_rewrite_options.scoped_allocator_optimization) + self.assertEqual(['CollectiveReduce'], + new_rewrite_options.scoped_allocator_opts.enable_op) + class DistributedCollectiveAllReduceStrategyTestWithChief( CollectiveAllReduceStrategyTestBase, parameterized.TestCase): @@ -293,10 +371,6 @@ class DistributedCollectiveAllReduceStrategyTestWithChief( cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=0, has_chief=True) - def setUp(self): - super(DistributedCollectiveAllReduceStrategyTestWithChief, self).setUp() - self._run_options.experimental.collective_graph_key = 7 - @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) def testMinimizeLossGraph(self, num_gpus): @@ -323,20 +397,36 @@ class DistributedCollectiveAllReduceStrategyTestWithChief( class LocalCollectiveAllReduceStrategy(CollectiveAllReduceStrategyTestBase, + strategy_test_lib.DistributionTestBase, parameterized.TestCase): def testMinimizeLossGraph(self, num_gpus=2): # Collective ops doesn't support strategy with one device. if context.num_gpus() < num_gpus: - return + self.skipTest('Not enough GPUs') self._test_minimize_loss_graph(None, None, num_gpus) def testComplexModel(self, num_gpus=2): # Collective ops doesn't support strategy with one device. if context.num_gpus() < num_gpus: - return + self.skipTest('Not enough GPUs') self._test_complex_model(None, None, num_gpus) + def testMakeInputFnIterator(self, num_gpus=2): + # Collective ops doesn't support strategy with one device. + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + dataset_fn = lambda: dataset_ops.Dataset.range(10) + expected_values = [[i, i+1] for i in range(0, 10, 2)] + + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=num_gpus, + expected_num_input_pipelines=1, + expected_input_pipeline_id=0) + self._test_input_fn_iterator(None, None, num_gpus, + input_fn, expected_values) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py index a5137165403..365ce5cdec7 100644 --- a/tensorflow/contrib/distribute/python/combinations.py +++ b/tensorflow/contrib/distribute/python/combinations.py @@ -53,11 +53,11 @@ from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib from tensorflow.contrib.optimizer_v2 import adagrad as adagrad_v2 from tensorflow.contrib.optimizer_v2 import adam as adam_v2 from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2 +from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.training import adagrad from tensorflow.python.training import adam -from tensorflow.python.training import distribution_strategy_context from tensorflow.python.training import gradient_descent from tensorflow.python.training import rmsprop from tensorflow.python.util import tf_inspect @@ -168,6 +168,8 @@ def _augment_with_special_arguments(test_method): if GPU_TEST: self.skipTest("Test that doesn't require GPUs.") elif context.num_gpus() < required_gpus: + # TODO(priyag): Consider allowing tests in graph mode using soft + # placement. self.skipTest( "{} GPUs are not available for this test. {} GPUs are available". 
format(required_gpus, context.num_gpus())) @@ -190,7 +192,7 @@ def _augment_with_special_arguments(test_method): kwargs_to_pass[arg] = kwargs[arg] if mode == "eager": - with ops.Graph().as_default(), context.eager_mode(): + with context.eager_mode(): if distribution: kwargs_to_pass["distribution"] = distribution.strategy test_method(**kwargs_to_pass) @@ -335,6 +337,13 @@ tpu_strategy_one_step = NamedDistribution( "TPUOneStep", lambda: tpu_lib.TPUStrategy( TPUClusterResolver(""), steps_per_run=1), required_tpu=True) +mirrored_strategy_with_one_cpu = NamedDistribution( + "Mirrored1CPU", + lambda: mirrored_lib.MirroredStrategy(["/cpu:0"])) +mirrored_strategy_with_one_gpu = NamedDistribution( + "Mirrored1GPU", + lambda: mirrored_lib.MirroredStrategy(["/gpu:0"]), + required_gpus=1) mirrored_strategy_with_gpu_and_cpu = NamedDistribution( "MirroredCPUAndGPU", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]), @@ -343,6 +352,21 @@ mirrored_strategy_with_two_gpus = NamedDistribution( "Mirrored2GPUs", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]), required_gpus=2) +core_mirrored_strategy_with_one_cpu = NamedDistribution( + "CoreMirrored1CPU", + lambda: mirrored_lib.CoreMirroredStrategy(["/cpu:0"])) +core_mirrored_strategy_with_one_gpu = NamedDistribution( + "CoreMirrored1GPU", + lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0"]), + required_gpus=1) +core_mirrored_strategy_with_gpu_and_cpu = NamedDistribution( + "CoreMirroredCPUAndGPU", + lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/cpu:0"]), + required_gpus=1) +core_mirrored_strategy_with_two_gpus = NamedDistribution( + "CoreMirrored2GPUs", + lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/gpu:1"]), + required_gpus=2) gradient_descent_optimizer_v1_fn = NamedObject( @@ -373,8 +397,11 @@ def distributions_and_v1_optimizers(): """A common set of combination with DistributionStrategies and Optimizers.""" return combine( distribution=[ - one_device_strategy, mirrored_strategy_with_gpu_and_cpu, - mirrored_strategy_with_two_gpus + one_device_strategy, + mirrored_strategy_with_gpu_and_cpu, + mirrored_strategy_with_two_gpus, + core_mirrored_strategy_with_gpu_and_cpu, + core_mirrored_strategy_with_two_gpus, ], optimizer_fn=optimizers_v1) @@ -383,7 +410,10 @@ def distributions_and_v2_optimizers(): """DistributionStrategies and V2 Optimizers.""" return combine( distribution=[ - one_device_strategy, mirrored_strategy_with_gpu_and_cpu, - mirrored_strategy_with_two_gpus + one_device_strategy, + mirrored_strategy_with_gpu_and_cpu, + mirrored_strategy_with_two_gpus, + core_mirrored_strategy_with_gpu_and_cpu, + core_mirrored_strategy_with_two_gpus, ], optimizer_fn=optimizers_v2) diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_device_ops_test.py similarity index 79% rename from tensorflow/contrib/distribute/python/cross_tower_ops_test.py rename to tensorflow/contrib/distribute/python/cross_device_ops_test.py index 3e274ba67ca..d6e9521c1c1 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py +++ b/tensorflow/contrib/distribute/python/cross_device_ops_test.py @@ -24,24 +24,24 @@ from absl.testing import parameterized import numpy as np from tensorflow.contrib.distribute.python import combinations -from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib -from tensorflow.contrib.distribute.python import cross_tower_utils from tensorflow.contrib.distribute.python import mirrored_strategy from 
tensorflow.contrib.distribute.python import multi_worker_test_base -from tensorflow.contrib.distribute.python import values as value_lib from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib +from tensorflow.python.distribute import cross_device_utils +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import variable_scope as vs -from tensorflow.python.training import device_util def _make_per_replica(values, devices, regroup=False): - devices = cross_tower_ops_lib.get_devices_from(devices) + devices = cross_device_ops_lib.get_devices_from(devices) assert len(values) == len(devices) # We simulate the result of regroup called on PerReplica which strips the @@ -66,7 +66,7 @@ def _fake_mirrored(value, devices): All components of the returned Mirrored have the same objects, which is not true in reality. """ - devices = cross_tower_ops_lib.get_devices_from(devices) + devices = cross_device_ops_lib.get_devices_from(devices) return value_lib.Mirrored( {d: v for d, v in zip(devices, [value] * len(devices))}) @@ -118,8 +118,8 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): self.assertEqual( sess.run(list(left._index.values())), list(right._index.values())) - def _testReductionAndBroadcast(self, cross_tower_ops, distribution): - devices = distribution.worker_devices + def _testReductionAndBroadcast(self, cross_device_ops, distribution): + devices = distribution.extended.worker_devices values = [constant_op.constant(float(d)) for d in range(len(devices))] per_replica = _make_per_replica(values, devices) @@ -132,35 +132,33 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): destination_mirrored = _fake_mirrored(1., devices) destination_different = _fake_mirrored(1., _cpu_device) destination_str = _cpu_device - destination_list = devices all_destinations = [ destination_mirrored, destination_different, destination_str, - destination_list ] # test reduce() for destinations in all_destinations: self._assert_values_equal( - cross_tower_ops.reduce( - vs.VariableAggregation.MEAN, + cross_device_ops.reduce( + reduce_util.ReduceOp.MEAN, per_replica, destinations=destinations), _fake_mirrored(mean, destinations)) self._assert_values_equal( - cross_tower_ops.reduce( - vs.VariableAggregation.MEAN, + cross_device_ops.reduce( + reduce_util.ReduceOp.MEAN, per_replica_2, destinations=destinations), _fake_mirrored(mean_2, destinations)) self._assert_values_equal( - cross_tower_ops.reduce( - vs.VariableAggregation.SUM, per_replica, + cross_device_ops.reduce( + reduce_util.ReduceOp.SUM, per_replica, destinations=destinations), _fake_mirrored(mean * len(devices), destinations)) self._assert_values_equal( - cross_tower_ops.reduce( - vs.VariableAggregation.SUM, + cross_device_ops.reduce( + reduce_util.ReduceOp.SUM, per_replica_2, destinations=destinations), _fake_mirrored(mean_2 * len(devices), destinations)) @@ -168,16 +166,16 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): # test batch_reduce() for d1, d2 in itertools.product(all_destinations, all_destinations): 
self._assert_values_equal( - cross_tower_ops.batch_reduce( - vs.VariableAggregation.MEAN, + cross_device_ops.batch_reduce( + reduce_util.ReduceOp.MEAN, [(per_replica, d1), (per_replica_2, d2)]), [ _fake_mirrored(mean, d1), _fake_mirrored(mean_2, d2) ]) self._assert_values_equal( - cross_tower_ops.batch_reduce( - vs.VariableAggregation.SUM, + cross_device_ops.batch_reduce( + reduce_util.ReduceOp.SUM, [(per_replica, d1), (per_replica_2, d2)]), [ _fake_mirrored(mean * len(devices), d1), @@ -187,7 +185,7 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): # test broadcast() for destinations in all_destinations: self._assert_values_equal( - cross_tower_ops.broadcast(constant_op.constant(1.), destinations), + cross_device_ops.broadcast(constant_op.constant(1.), destinations), _fake_mirrored(1., destinations)) @@ -196,62 +194,65 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): # combinations module so that we can pass in devices instead of a distribution # strategy. reduction_to_one_combinations = combinations.combine( - cross_tower_ops=[ + cross_device_ops=[ combinations.NamedObject( "DefaultReductionToOneDeviceCrossDeviceOps", - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()), + cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()), combinations.NamedObject( "ReductionToCPUDeviceCrossDeviceOps", - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps( + cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps( reduce_to_device=_cpu_device)), combinations.NamedObject( "AccumulateNCrossDeviceOp", - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps( + cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps( accumulation_fn=math_ops.accumulate_n)), ], distribution=[ combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus ], mode=["graph", "eager"]) allreduce_combinations = combinations.combine( - cross_tower_ops=[ + cross_device_ops=[ combinations.NamedObject( "AllReduce", - cross_tower_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)), + cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)), combinations.NamedObject( "HierarchicalCopy", - cross_tower_ops_lib.AllReduceCrossDeviceOps( + cross_device_ops_lib.AllReduceCrossDeviceOps( "hierarchical_copy", 8, 0, 0)), combinations.NamedObject( "AllReduceNoGradientRepacking", - cross_tower_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)), + cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)), combinations.NamedObject( "HierarchicalCopyAggregateSmallTensors", - cross_tower_ops_lib.AllReduceCrossDeviceOps( + cross_device_ops_lib.AllReduceCrossDeviceOps( "hierarchical_copy", 0, 100, 10)) ], - distribution=[combinations.mirrored_strategy_with_two_gpus], + distribution=[combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], mode=["graph", "eager"]) @combinations.generate(reduction_to_one_combinations + allreduce_combinations) - def testReductionAndBroadcast(self, cross_tower_ops, distribution): + def testReductionAndBroadcast(self, cross_device_ops, distribution): with distribution.scope(): - self._testReductionAndBroadcast(cross_tower_ops, distribution) + self._testReductionAndBroadcast(cross_device_ops, distribution) def testChooseAlgorithm(self): device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 
2, 7], [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]] - result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps) + result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links) + self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps) self.assertEqual(result._all_reduce_alg, "hierarchical_copy") self.assertEqual(result._num_packs, 8) # if there are only 4 devices device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]] - result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps) + result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links) + self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps) self.assertEqual(result._all_reduce_alg, "nccl") self.assertEqual(result._num_packs, 1) @@ -259,16 +260,16 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6], [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7], [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]] - result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps) + result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links) + self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps) self.assertEqual(result._all_reduce_alg, "hierarchical_copy") self.assertEqual(result._num_packs, 8) # if not dgx1-like links device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]] - result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps) + result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links) + self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps) self.assertEqual(result._all_reduce_alg, "nccl") self.assertEqual(result._num_packs, 1) @@ -280,8 +281,8 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0]) t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1]) per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1}) - result = cross_tower_ops_lib._simple_reduce( - per_replica, devices[0], math_ops.add_n, vs.VariableAggregation.SUM) + result = cross_device_ops_lib._simple_reduce( + per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM) # Test that the result is semantically equal to both the concatenated # IndexedSlices with and without duplicate indices. 
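The `_simple_reduce` assertions above rely on a SUM over per-replica `IndexedSlices` being insensitive to whether duplicate indices are merged before or after the reduction. A minimal numpy sketch of that equivalence, reusing the test's values, indices, and [5, 2] dense shape (`densify` is a made-up helper for illustration only, not a TensorFlow API):

```
import numpy as np

def densify(values, indices, dense_shape):
  # Hypothetical helper: scatter-add each sparse row into a dense array.
  dense = np.zeros(dense_shape, dtype=np.float32)
  for row, idx in zip(values, indices):
    dense[idx] += np.asarray(row, dtype=np.float32)
  return dense

dense_shape = (5, 2)
# Same data as the test: replica 0 holds [[1, 2]] at index 1, replica 1
# holds [[3, 4], [5, 6]] at indices 1 and 3.
concat_with_dups = densify([[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], dense_shape)
concat_without_dups = densify([[4., 6.], [5., 6.]], [1, 3], dense_shape)

# A SUM reduction gives the same dense result either way.
assert np.allclose(concat_with_dups, concat_without_dups)
```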
@@ -294,19 +295,19 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): @combinations.generate( combinations.combine( - cross_tower_ops_instance=[ + cross_device_ops_instance=[ combinations.NamedObject( "ReductionToOneDeviceCrossDeviceOps", - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()), + cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()), combinations.NamedObject( "AllReduceCrossDeviceOps", - cross_tower_ops_lib.AllReduceCrossDeviceOps()) + cross_device_ops_lib.AllReduceCrossDeviceOps()) ], - aggregation=[vs.VariableAggregation.SUM, vs.VariableAggregation.MEAN], + reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN], batch_reduce=[True, False], mode=["graph", "eager"], required_gpus=1)) - def testIndexedSlicesAllReduce(self, cross_tower_ops_instance, aggregation, + def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op, batch_reduce): devices = ["/cpu:0", "/gpu:0"] dense_shape = [5, 2] @@ -316,20 +317,20 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1}) if batch_reduce: - result = cross_tower_ops_instance.batch_reduce( - aggregation, [(per_replica, devices)]) + result = cross_device_ops_instance.batch_reduce( + reduce_op, [(per_replica, per_replica)]) else: - result = cross_tower_ops_instance.reduce( - aggregation, per_replica, devices) + result = cross_device_ops_instance.reduce( + reduce_op, per_replica, per_replica) total_indices_with_dups = [1, 1, 3] total_indices_without_dups = [1, 3] - if aggregation == vs.VariableAggregation.SUM: + if reduce_op == reduce_util.ReduceOp.SUM: total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]] total_values_without_dups = [[4., 6.], [5., 6.]] else: - assert aggregation == vs.VariableAggregation.MEAN + assert reduce_op == reduce_util.ReduceOp.MEAN total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]] total_values_without_dups = [[2., 3.], [2.5, 3.]] @@ -356,49 +357,63 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase, "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1" ] multi_worker_allreduce_combinations = combinations.combine( - cross_tower_ops=[ + cross_device_ops=[ combinations.NamedObject( "MultiWorkerAllReduce", - cross_tower_ops_lib.MultiWorkerAllReduce( + cross_device_ops_lib.MultiWorkerAllReduce( worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)), combinations.NamedObject( "MultiWorkerAllReducePack", - cross_tower_ops_lib.MultiWorkerAllReduce( + cross_device_ops_lib.MultiWorkerAllReduce( worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)), combinations.NamedObject( "MultiWorkerAllReduceAggregation", - cross_tower_ops_lib.MultiWorkerAllReduce( + cross_device_ops_lib.MultiWorkerAllReduce( worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)), combinations.NamedObject( "MultiWorkerAllReduceMultipleSpecs", - cross_tower_ops_lib.MultiWorkerAllReduce( + cross_device_ops_lib.MultiWorkerAllReduce( worker_devices, 2, [("pscpu/pscpu", 2, 100), ("xring", 2, -1)], 0, 0, 0)), ], distribution=[ combinations.NamedDistribution( "MirroredCPU", - lambda: mirrored_strategy.MirroredStrategy(num_gpus=0), + lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=0), required_gpus=0), combinations.NamedDistribution( "Mirrored1GPU", - lambda: mirrored_strategy.MirroredStrategy(num_gpus=1), + lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=1), required_gpus=1), combinations.NamedDistribution( "Mirrored2GPUs", - lambda: 
mirrored_strategy.MirroredStrategy(num_gpus=2), + lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=2), + required_gpus=2), + # pylint: disable=g-long-lambda + combinations.NamedDistribution( + "CoreMirroredCPU", + lambda: mirrored_strategy.CoreMirroredStrategy(["/device:CPU:0"]), + required_gpus=0), + combinations.NamedDistribution( + "CoreMirrored1GPU", + lambda: mirrored_strategy.CoreMirroredStrategy(["/device:GPU:0"]), + required_gpus=1), + combinations.NamedDistribution( + "CoreMirrored2GPUs", + lambda: mirrored_strategy.CoreMirroredStrategy( + ["/device:GPU:0", "/device:GPU:1"]), required_gpus=2), ], mode=["graph"]) @combinations.generate(multi_worker_allreduce_combinations) - def testReductionAndBroadcast(self, cross_tower_ops, distribution): + def testReductionAndBroadcast(self, cross_device_ops, distribution): distribution.configure(cluster_spec={ "worker": ["/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"] }) with distribution.scope(): - self._testReductionAndBroadcast(cross_tower_ops, distribution) + self._testReductionAndBroadcast(cross_device_ops, distribution) class MultiWorkerCollectiveAllReduceTest( @@ -419,7 +434,7 @@ class MultiWorkerCollectiveAllReduceTest( MultiWorkerCollectiveAllReduceTest.collective_key_base += 100000 def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False): - collective_keys = cross_tower_utils.CollectiveKeys( + collective_keys = cross_device_utils.CollectiveKeys( group_key_start=10 * num_gpus + MultiWorkerCollectiveAllReduceTest.collective_key_base, instance_key_start=num_gpus * 100 + @@ -427,7 +442,7 @@ class MultiWorkerCollectiveAllReduceTest( instance_key_with_id_start=num_gpus * 10000 + MultiWorkerCollectiveAllReduceTest.collective_key_base) if local_mode: - collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce( + collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( 1, num_gpus, collective_keys=collective_keys) if num_gpus: devices = ["/device:GPU:%d" % i for i in range(num_gpus)] @@ -435,7 +450,7 @@ class MultiWorkerCollectiveAllReduceTest( devices = ["/device:CPU:0"] return collective_all_reduce_ops, devices, "" else: - collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce( + collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( 3, num_gpus, collective_keys=collective_keys) if num_gpus: devices = [ @@ -491,37 +506,35 @@ class MultiWorkerCollectiveAllReduceTest( destination_mirrored = _fake_mirrored(1., devices) destination_different = _fake_mirrored(1., _cpu_device) destination_str = _cpu_device - destination_list = devices all_destinations = [ - destination_different, destination_mirrored, destination_str, - destination_list + destination_different, destination_mirrored, destination_str ] # test reduce() for destinations in all_destinations: self._assert_values_equal( collective_all_reduce.reduce( - vs.VariableAggregation.MEAN, + reduce_util.ReduceOp.MEAN, per_replica, destinations=destinations), _fake_mirrored(mean, destinations), sess) self._assert_values_equal( collective_all_reduce.reduce( - vs.VariableAggregation.MEAN, + reduce_util.ReduceOp.MEAN, per_replica_2, destinations=destinations), _fake_mirrored(mean_2, destinations), sess) self._assert_values_equal( collective_all_reduce.reduce( - vs.VariableAggregation.SUM, + reduce_util.ReduceOp.SUM, per_replica, destinations=destinations), _fake_mirrored(mean * len(devices) * num_workers, destinations), sess) self._assert_values_equal( collective_all_reduce.reduce( - 
vs.VariableAggregation.SUM, + reduce_util.ReduceOp.SUM, per_replica_2, destinations=destinations), _fake_mirrored(mean_2 * len(devices) * num_workers, destinations), @@ -530,7 +543,7 @@ class MultiWorkerCollectiveAllReduceTest( # test batch_reduce() for d1, d2 in itertools.product(all_destinations, all_destinations): self._assert_values_equal( - collective_all_reduce.batch_reduce(vs.VariableAggregation.MEAN, + collective_all_reduce.batch_reduce(reduce_util.ReduceOp.MEAN, [(per_replica, d1), (per_replica_2, d2)]), [ @@ -538,7 +551,7 @@ class MultiWorkerCollectiveAllReduceTest( _fake_mirrored(mean_2, d2) ], sess) self._assert_values_equal( - collective_all_reduce.batch_reduce(vs.VariableAggregation.SUM, + collective_all_reduce.batch_reduce(reduce_util.ReduceOp.SUM, [(per_replica, d1), (per_replica_2, d2)]), [ diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py b/tensorflow/contrib/distribute/python/cross_device_utils_test.py similarity index 83% rename from tensorflow/contrib/distribute/python/cross_tower_utils_test.py rename to tensorflow/contrib/distribute/python/cross_device_utils_test.py index e46240abbfa..2303a31677a 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py +++ b/tensorflow/contrib/distribute/python/cross_device_utils_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for cross_tower_utils.""" +"""Tests for cross_device_utils.""" from __future__ import absolute_import from __future__ import division @@ -21,14 +21,14 @@ from __future__ import print_function from absl.testing import parameterized from tensorflow.contrib.distribute.python import combinations -from tensorflow.contrib.distribute.python import cross_tower_utils -from tensorflow.contrib.distribute.python import values as value_lib +from tensorflow.python.distribute import cross_device_utils +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import math_ops -from tensorflow.python.training import device_util class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): @@ -43,7 +43,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): t0 = constant_op.constant([[1., 2.], [0, 0], [3., 4.]]) t1 = constant_op.constant([[0., 0.], [5, 6], [7., 8.]]) total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]]) - result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1]) + result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1]) self._assert_values_equal(total, result) @test_util.run_in_graph_and_eager_modes @@ -53,7 +53,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): t1 = math_ops._as_indexed_slices( constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]]) - result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1]) + result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1]) self.assertIsInstance(result, ops.IndexedSlices) self._assert_values_equal(total, result) @@ -62,7 +62,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): t = 
constant_op.constant([[1., 2.], [0, 0], [3., 4.]]) n = 2 expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]]) - result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n) + result = cross_device_utils.divide_by_n_tensors_or_indexed_slices(t, n) self._assert_values_equal(expected, result) @test_util.run_in_graph_and_eager_modes @@ -71,7 +71,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) n = 2 expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]]) - result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n) + result = cross_device_utils.divide_by_n_tensors_or_indexed_slices(t, n) self.assertIsInstance(result, ops.IndexedSlices) self._assert_values_equal(expected, result) @@ -79,7 +79,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): def testIsIndexedSlices(self): t = math_ops._as_indexed_slices( constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) - self.assertTrue(cross_tower_utils.contains_indexed_slices(t)) + self.assertTrue(cross_device_utils.contains_indexed_slices(t)) @test_util.run_in_graph_and_eager_modes def testContainsIndexedSlices_List(self): @@ -87,7 +87,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) t1 = math_ops._as_indexed_slices( constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) - self.assertTrue(cross_tower_utils.contains_indexed_slices([t0, t1])) + self.assertTrue(cross_device_utils.contains_indexed_slices([t0, t1])) @test_util.run_in_graph_and_eager_modes def testContainsIndexedSlices_Tuple(self): @@ -95,7 +95,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) t1 = math_ops._as_indexed_slices( constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) - self.assertTrue(cross_tower_utils.contains_indexed_slices((t0, t1))) + self.assertTrue(cross_device_utils.contains_indexed_slices((t0, t1))) @test_util.run_in_graph_and_eager_modes def testContainsIndexedSlices_PerReplica(self): @@ -104,7 +104,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): t1 = math_ops._as_indexed_slices( constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) per_replica = value_lib.PerReplica({"/gpu:0": t0, "/cpu:0": t1}) - self.assertTrue(cross_tower_utils.contains_indexed_slices(per_replica)) + self.assertTrue(cross_device_utils.contains_indexed_slices(per_replica)) @combinations.generate(combinations.combine( mode=["graph", "eager"], @@ -113,7 +113,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): with ops.device("/cpu:0"): t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]]) destination = "/gpu:0" - result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device( + result = cross_device_utils.copy_tensor_or_indexed_slices_to_device( t, destination) self._assert_values_equal(t, result) @@ -128,7 +128,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): t = math_ops._as_indexed_slices( constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) destination = "/gpu:0" - result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device( + result = cross_device_utils.copy_tensor_or_indexed_slices_to_device( t, destination) self.assertIsInstance(result, ops.IndexedSlices) diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py index a1355c0b09e..e17085628ba 
100644 --- a/tensorflow/contrib/distribute/python/estimator_integration_test.py +++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py @@ -34,7 +34,7 @@ from tensorflow.python.estimator.canned import dnn_linear_combined from tensorflow.python.estimator.canned import prediction_keys from tensorflow.python.estimator.export import export from tensorflow.python.estimator.inputs import numpy_io -from tensorflow.python.feature_column import feature_column +from tensorflow.python.feature_column import feature_column_lib as feature_column from tensorflow.python.framework import ops from tensorflow.python.platform import gfile from tensorflow.python.summary.writer import writer_cache @@ -63,7 +63,9 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase, distribution=[ combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus ], use_train_and_evaluate=[True, False])) def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate): @@ -75,12 +77,12 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase, train_input_fn = self.dataset_input_fn( x={'x': data}, y=data, - batch_size=batch_size // len(distribution.worker_devices), + batch_size=batch_size // distribution.num_replicas_in_sync, shuffle=True) eval_input_fn = self.dataset_input_fn( x={'x': data}, y=data, - batch_size=batch_size // len(distribution.worker_devices), + batch_size=batch_size // distribution.num_replicas_in_sync, shuffle=False) predict_input_fn = numpy_io.numpy_input_fn( x={'x': data}, batch_size=batch_size, shuffle=False) diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py index 8f82b4c92aa..b369a7fefe6 100644 --- a/tensorflow/contrib/distribute/python/estimator_training_test.py +++ b/tensorflow/contrib/distribute/python/estimator_training_test.py @@ -24,7 +24,6 @@ import json import os import sys import tempfile -import threading from absl.testing import parameterized import numpy as np @@ -45,11 +44,13 @@ from tensorflow.python.estimator import training as estimator_training from tensorflow.python.estimator.canned import dnn_linear_combined from tensorflow.python.estimator.canned import prediction_keys from tensorflow.python.estimator.export import export as export_lib -from tensorflow.python.feature_column import feature_column +from tensorflow.python.feature_column import feature_column_lib as feature_column from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.summary import summary_iterator from tensorflow.python.summary.writer import writer_cache +from tensorflow.python.training import session_manager + BATCH_SIZE = 10 LABEL_DIMENSION = 2 @@ -68,57 +69,19 @@ PS = dc._TaskType.PS original_run_std_server = dc._run_std_server -class MockOsEnv(dict): - - def __init__(self, *args): - self._thread_local = threading.local() - super(MockOsEnv, self).__init__(*args) - - def get(self, key, default): - if not hasattr(self._thread_local, "dict"): - self._thread_local.dict = dict() - if key == "TF_CONFIG": - return dict.get(self._thread_local.dict, key, default) - else: - return dict.get(self, key, default) - - def __getitem__(self, key): - if not hasattr(self._thread_local, "dict"): - self._thread_local.dict = 
dict() - if key == "TF_CONFIG": - return dict.__getitem__(self._thread_local.dict, key) - else: - return dict.__getitem__(self, key) - - def __setitem__(self, key, val): - if not hasattr(self._thread_local, "dict"): - self._thread_local.dict = dict() - if key == "TF_CONFIG": - return dict.__setitem__(self._thread_local.dict, key, val) - else: - return dict.__setitem__(self, key, val) - - -class DistributeCoordinatorIntegrationTest(test.TestCase, - parameterized.TestCase): +class DistributeCoordinatorIntegrationTest( + multi_worker_test_base.IndependentWorkerTestBase, parameterized.TestCase): @classmethod def setUpClass(cls): """Create a local cluster with 2 workers.""" + super(DistributeCoordinatorIntegrationTest, cls).setUpClass() cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=2, has_eval=True) def setUp(self): self._model_dir = tempfile.mkdtemp() - self._mock_os_env = MockOsEnv() - self._mock_context = test.mock.patch.object(os, "environ", - self._mock_os_env) super(DistributeCoordinatorIntegrationTest, self).setUp() - self._mock_context.__enter__() - - def tearDown(self): - self._mock_context.__exit__(None, None, None) - super(DistributeCoordinatorIntegrationTest, self).tearDown() def dataset_input_fn(self, x, y, batch_size, shuffle): @@ -141,8 +104,8 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, def _extract_loss_and_global_step(self, event_folder): """Returns the loss and global step in last event.""" event_paths = glob.glob(os.path.join(event_folder, "events*")) - self.assertGreater(len(event_paths), 0, - msg="Event file not found in dir %s" % event_folder) + self.assertNotEmpty( + event_paths, msg="Event file not found in dir %s" % event_folder) loss = None global_step_count = None @@ -202,10 +165,10 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, train_input_fn = self.dataset_input_fn( x={"x": DATA}, y=DATA, - batch_size=BATCH_SIZE // len(train_distribute.worker_devices), + batch_size=BATCH_SIZE // train_distribute.num_replicas_in_sync, shuffle=True) if eval_distribute: - eval_batch_size = BATCH_SIZE // len(eval_distribute.worker_devices) + eval_batch_size = BATCH_SIZE // eval_distribute.num_replicas_in_sync else: eval_batch_size = BATCH_SIZE eval_input_fn = self.dataset_input_fn( @@ -285,27 +248,34 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, ]) self.assertAllEqual((BATCH_SIZE, LABEL_DIMENSION), predicted_proba.shape) + def _get_strategy_object(self, strategy_cls): + if strategy_cls == mirrored_strategy.CoreMirroredStrategy: + return strategy_cls(mirrored_strategy.all_local_devices()) + else: + return strategy_cls(num_gpus_per_worker=context.num_gpus()) + @combinations.generate( combinations.combine( mode=["graph"], train_distribute_cls=[ collective_all_reduce_strategy.CollectiveAllReduceStrategy, mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy, parameter_server_strategy.ParameterServerStrategy ], eval_distribute_cls=[ - None, mirrored_strategy.MirroredStrategy, + None, + mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy, parameter_server_strategy.ParameterServerStrategy, ], required_gpus=[0, 1])) def test_complete_flow_standalone_client(self, train_distribute_cls, eval_distribute_cls): - train_distribute = train_distribute_cls( - num_gpus_per_worker=context.num_gpus()) + train_distribute = self._get_strategy_object(train_distribute_cls) if eval_distribute_cls: - eval_distribute = eval_distribute_cls( - 
num_gpus_per_worker=context.num_gpus()) + eval_distribute = self._get_strategy_object(eval_distribute_cls) else: eval_distribute = None @@ -322,20 +292,20 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, mode=["graph"], train_distribute_cls=[ mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy, ], eval_distribute_cls=[ None, mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy, ], required_gpus=[0, 1])) def test_estimator_standalone_client(self, train_distribute_cls, eval_distribute_cls): - train_distribute = train_distribute_cls( - num_gpus_per_worker=context.num_gpus()) + train_distribute = self._get_strategy_object(train_distribute_cls) if eval_distribute_cls: - eval_distribute = eval_distribute_cls( - num_gpus_per_worker=context.num_gpus()) + eval_distribute = self._get_strategy_object(eval_distribute_cls) else: eval_distribute = None @@ -355,47 +325,15 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, self._barrier.wait() return ret - def _task_thread(self, train_distribute, eval_distribute, tf_config): - os.environ["TF_CONFIG"] = json.dumps(tf_config) + def _independent_worker_fn( + self, + train_distribute, + eval_distribute, + ): with test.mock.patch.object(dc, "_run_std_server", self._mock_run_std_server): self._complete_flow(train_distribute, eval_distribute) - def _run_task_in_thread(self, cluster_spec, task_type, task_id, - train_distribute, eval_distribute): - if task_type: - tf_config = { - "cluster": cluster_spec, - "task": { - "type": task_type, - "index": task_id - } - } - else: - tf_config = { - "cluster": cluster_spec, - "task": { - "type": task_type, - "index": task_id - } - } - t = threading.Thread( - target=self._task_thread, - args=(train_distribute, eval_distribute, tf_config)) - t.start() - return t - - def _run_multiple_tasks_in_threads(self, cluster_spec, train_distribute, - eval_distribute): - threads = {} - for task_type in cluster_spec.keys(): - threads[task_type] = [] - for task_id in range(len(cluster_spec[task_type])): - t = self._run_task_in_thread(cluster_spec, task_type, task_id, - train_distribute, eval_distribute) - threads[task_type].append(t) - return threads - @combinations.generate( combinations.combine( mode=["graph"], @@ -405,21 +343,20 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, ], eval_distribute_cls=[ None, mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy, parameter_server_strategy.ParameterServerStrategy, ], required_gpus=[0, 1])) def test_complete_flow_indepedent_worker_between_graph( self, train_distribute_cls, eval_distribute_cls): - train_distribute = train_distribute_cls( - num_gpus_per_worker=context.num_gpus()) - if (context.num_gpus() < 2 and eval_distribute_cls == collective_all_reduce_strategy.CollectiveAllReduceStrategy): self.skipTest("`CollectiveAllReduceStrategy` needs at least two towers.") + train_distribute = self._get_strategy_object(train_distribute_cls) + if eval_distribute_cls: - eval_distribute = eval_distribute_cls( - num_gpus_per_worker=context.num_gpus()) + eval_distribute = self._get_strategy_object(eval_distribute_cls) else: eval_distribute = None @@ -435,8 +372,9 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, # 3 workers and 1 evaluator. 
self._barrier = dc._Barrier(4) - threads = self._run_multiple_tasks_in_threads( - cluster_spec, train_distribute, eval_distribute) + threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn, + cluster_spec, train_distribute, + eval_distribute) for task_type, ts in threads.items(): if task_type == PS: continue @@ -449,17 +387,22 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, @combinations.generate( combinations.combine( mode=["graph"], - train_distribute_cls=[mirrored_strategy.MirroredStrategy], - eval_distribute_cls=[None, mirrored_strategy.MirroredStrategy], + train_distribute_cls=[ + mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy + ], + eval_distribute_cls=[ + None, + mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy + ], required_gpus=[0, 1])) def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls, eval_distribute_cls): - train_distribute = train_distribute_cls( - num_gpus_per_worker=context.num_gpus()) + train_distribute = self._get_strategy_object(train_distribute_cls) if eval_distribute_cls: - eval_distribute = eval_distribute_cls( - num_gpus_per_worker=context.num_gpus()) + eval_distribute = self._get_strategy_object(eval_distribute_cls) else: eval_distribute = None @@ -467,8 +410,9 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, num_workers=3, num_ps=0, has_eval=True) # 3 workers and 1 evaluator. self._barrier = dc._Barrier(4) - threads = self._run_multiple_tasks_in_threads( - cluster_spec, train_distribute, eval_distribute) + threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn, + cluster_spec, train_distribute, + eval_distribute) threads[WORKER][0].join() threads[EVALUATOR][0].join() @@ -506,7 +450,8 @@ class RunConfigTest(test.TestCase): "os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITHOUT_TASK)}): run_config_lib.RunConfig( experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + train_distribute=mirrored_strategy.CoreMirroredStrategy( + ["/device:GPU:0", "/device:GPU:1"]))) def test_should_run_distribute_coordinator(self): """Tests that should_run_distribute_coordinator return a correct value.""" @@ -529,10 +474,12 @@ class RunConfigTest(test.TestCase): {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): config_with_train_distribute = run_config_lib.RunConfig( experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + train_distribute=mirrored_strategy.CoreMirroredStrategy( + ["/device:GPU:0", "/device:GPU:1"]))) config_with_eval_distribute = run_config_lib.RunConfig( experimental_distribute=DistributeConfig( - eval_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + eval_distribute=mirrored_strategy.CoreMirroredStrategy( + ["/device:GPU:0", "/device:GPU:1"]))) self.assertTrue( dc_training.should_run_distribute_coordinator( config_with_train_distribute)) @@ -545,26 +492,27 @@ class RunConfigTest(test.TestCase): {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}): config = run_config_lib.RunConfig( experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + train_distribute=mirrored_strategy.CoreMirroredStrategy( + ["/device:GPU:0", "/device:GPU:1"]))) self.assertFalse(dc_training.should_run_distribute_coordinator(config)) def test_init_run_config_duplicate_distribute(self): with self.assertRaises(ValueError): run_config_lib.RunConfig( - 
train_distribute=mirrored_strategy.MirroredStrategy(), + train_distribute=mirrored_strategy.CoreMirroredStrategy(), experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.MirroredStrategy())) + train_distribute=mirrored_strategy.CoreMirroredStrategy())) with self.assertRaises(ValueError): run_config_lib.RunConfig( - eval_distribute=mirrored_strategy.MirroredStrategy(), + eval_distribute=mirrored_strategy.CoreMirroredStrategy(), experimental_distribute=DistributeConfig( - eval_distribute=mirrored_strategy.MirroredStrategy())) + eval_distribute=mirrored_strategy.CoreMirroredStrategy())) def test_init_run_config_none_distribute_coordinator_mode(self): # We don't use distribute coordinator for local training. config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.MirroredStrategy()) + train_distribute=mirrored_strategy.CoreMirroredStrategy()) dc_training.init_run_config(config, {}) self.assertIsNone(config._distribute_coordinator_mode) @@ -572,7 +520,7 @@ class RunConfigTest(test.TestCase): with test.mock.patch.dict("os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}): config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.MirroredStrategy()) + train_distribute=mirrored_strategy.CoreMirroredStrategy()) self.assertIsNone(config._distribute_coordinator_mode) # When `train_distribute` is not specified, don't use distribute @@ -588,7 +536,7 @@ class RunConfigTest(test.TestCase): with test.mock.patch.dict("os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.MirroredStrategy()) + train_distribute=mirrored_strategy.CoreMirroredStrategy()) self.assertEqual(config._distribute_coordinator_mode, dc.CoordinatorMode.INDEPENDENT_WORKER) @@ -597,7 +545,7 @@ class RunConfigTest(test.TestCase): # `experimental.remote_cluster` is set use distribute coordinator with # STANDALONE_CLIENT mode. config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.MirroredStrategy(), + train_distribute=mirrored_strategy.CoreMirroredStrategy(), experimental_distribute=DistributeConfig( remote_cluster={"chief": ["fake_worker"]})) self.assertEqual(config._distribute_coordinator_mode, @@ -605,5 +553,15 @@ class RunConfigTest(test.TestCase): if __name__ == "__main__": + # Reduce `recovery_wait_secs` from 30 seconds so the test completes quickly. + orig_init = session_manager.SessionManager.__init__ + + def new_init(*args, **kwargs): + kwargs.pop("recovery_wait_secs", None) + kwargs["recovery_wait_secs"] = 0.5 + orig_init(*args, **kwargs) + + session_manager.SessionManager.__init__ = new_init + with test.mock.patch.object(sys, "exit", os._exit): test.main() diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py index 0fd3acd0451..60fda996642 100644 --- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py +++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py @@ -20,6 +20,10 @@ from __future__ import print_function import tensorflow as tf +from tensorflow.python.distribute import mirrored_strategy +from tensorflow.python.keras.optimizer_v2 import rmsprop + + NUM_CLASSES = 10 @@ -102,18 +106,23 @@ def main(_): # Build the train and eval datasets from the MNIST data. Also return the # input shape which is constructed based on the `image_data_format` # i.e channels_first or channels_last. 
+ tf.enable_eager_execution() + train_ds, eval_ds, input_shape = get_input_datasets() model = get_model(input_shape) # Instantiate the MirroredStrategy object. If we don't specify `num_gpus` or # the `devices` argument then all the GPUs available on the machine are used. - strategy = tf.contrib.distribute.MirroredStrategy() + # TODO(priyag): Use `tf.distribute.MirroredStrategy` once available. + strategy = mirrored_strategy.MirroredStrategy(['/gpu:0', '/cpu:0']) + + optimizer = rmsprop.RMSProp(learning_rate=0.001) # Compile the model by passing the distribution strategy object to the # `distribute` argument. `fit`, `evaluate` and `predict` will be distributed # based on the strategy instantiated. model.compile(loss=tf.keras.losses.categorical_crossentropy, - optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001), + optimizer=optimizer, metrics=['accuracy'], distribute=strategy) diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py index 46a1cf41c55..6dfd85bcc4f 100644 --- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py +++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py @@ -25,18 +25,23 @@ import numpy as np import six from tensorflow.contrib.distribute.python import combinations -from tensorflow.contrib.distribute.python import mirrored_strategy from tensorflow.core.protobuf import config_pb2 +from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.eager import context +from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.estimator import run_config from tensorflow.python.estimator import training from tensorflow.python.estimator.canned import dnn_linear_combined from tensorflow.python.estimator.canned import prediction_keys from tensorflow.python.estimator.export import export from tensorflow.python.estimator.inputs import numpy_io -from tensorflow.python.feature_column import feature_column +from tensorflow.python.feature_column import feature_column_lib as feature_column +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops from tensorflow.python.keras.optimizer_v2 import adam +from tensorflow.python.keras.optimizer_v2 import gradient_descent +from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import gfile @@ -64,7 +69,9 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase): distribution=[ combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus ], use_train_and_evaluate=[True, False])) def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate): @@ -76,11 +83,11 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase): train_input_fn = self.dataset_input_fn( x={'x': data}, y=data, - batch_size=batch_size // len(distribution.worker_devices)) + batch_size=batch_size // distribution.num_replicas_in_sync) eval_input_fn = self.dataset_input_fn( x={'x': data}, y=data, - batch_size=batch_size // len(distribution.worker_devices)) + batch_size=batch_size // 
distribution.num_replicas_in_sync) predict_input_fn = numpy_io.numpy_input_fn( x={'x': data}, batch_size=batch_size, shuffle=False) @@ -136,44 +143,51 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase): shutil.rmtree(self._model_dir) -class MirroredStrategyOptimizerV2Test(test.TestCase): +def get_model(): + x = keras.layers.Input(shape=(3,), name='input') + y = keras.layers.Dense(4, name='dense')(x) + model = keras.Model(x, y) + return model - def testKerasOptimizerWithUnequalInput(self): - if context.num_gpus() < 1: - self.skipTest('Not enough GPUs.') - def create_fn(device_id): +class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase): + + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=['graph'])) + def testKerasOptimizerWithUnequalInput(self, distribution): + def create_fn(): var = variables.Variable( 2.0, name='var', aggregation=variable_scope.VariableAggregation.SUM) # grad for cpu is 1, grad for gpu is 2, avg grad is 1.5. - loss = (device_id + 1) * var + loss = math_ops.cast(_replica_id() + 1, dtype=dtypes.float32) * var optimizer = adam.Adam(learning_rate=0.01, beta_1=0.2, beta_2=0.2) train_op = optimizer.minimize(loss, var_list=[var]) m = optimizer.get_slot(var, 'm') v = optimizer.get_slot(var, 'v') - return (var, m, v, train_op, optimizer.iteration) + return (var, m, v, train_op, optimizer.iterations) devices = ['/device:GPU:0', '/device:CPU:0'] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): - (var, m, v, op, counter) = dist.call_for_each_replica( - create_fn, args=[dist.worker_device_index]) + with distribution.scope(): + (var, m, v, op, counter) = distribution.call_for_each_replica(create_fn) self.evaluate(variables.global_variables_initializer()) var_val = [2.0, 2.0, 2.0] self.assertAllClose( var_val, self.evaluate( - [dist.read_var(var), + [distribution.read_var(var), var.get(devices[0]), var.get(devices[1])])) self.assertAllClose([0, 0, 0], self.evaluate([ - dist.read_var(counter), + distribution.read_var(counter), counter.get(devices[0]), counter.get(devices[1]) ])) - train_op = dist.unwrap(op) + train_op = distribution.unwrap(op) self.evaluate(train_op) # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2 m_val = [1.2, 1.2, 1.2] @@ -181,7 +195,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): self.assertAllClose( m_val, self.evaluate( - [dist.read_var(m), + [distribution.read_var(m), m.get(devices[0]), m.get(devices[1])])) # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25 @@ -189,7 +203,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): self.assertAllClose( v_val, self.evaluate( - [dist.read_var(v), + [distribution.read_var(v), v.get(devices[0]), v.get(devices[1])])) # var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - beta1) @@ -198,12 +212,12 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): self.assertAllClose( var_val, self.evaluate( - [dist.read_var(var), + [distribution.read_var(var), var.get(devices[0]), var.get(devices[1])])) self.assertAllClose([1, 1, 1], self.evaluate([ - dist.read_var(counter), + distribution.read_var(counter), counter.get(devices[0]), counter.get(devices[1]) ])) @@ -214,7 +228,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): self.assertAllClose( m_val, self.evaluate( - [dist.read_var(m), + [distribution.read_var(m), m.get(devices[0]), m.get(devices[1])])) # v(2) 
= beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25 @@ -222,16 +236,50 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): self.assertAllClose( v_val, self.evaluate( - [dist.read_var(v), + [distribution.read_var(v), v.get(devices[0]), v.get(devices[1])])) self.assertAllClose([2, 2, 2], self.evaluate([ - dist.read_var(counter), + distribution.read_var(counter), counter.get(devices[0]), counter.get(devices[1]) ])) + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=['graph'])) + def testOptimizerWithKerasModelAndNumpyArrays(self, distribution): + + with self.cached_session(): + model = get_model() + optimizer = gradient_descent.SGD(0.001) + loss = 'mse' + metrics = ['mae'] + model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + + inputs = np.zeros((64, 3), dtype=np.float32) + targets = np.zeros((64, 4), dtype=np.float32) + + model.fit( + inputs, + targets, + epochs=1, + batch_size=2, + verbose=0, + validation_data=(inputs, targets)) + model.evaluate(inputs, targets) + model.predict(inputs) + + +def _replica_id(): + replica_id = ds_context.get_replica_context().replica_id_in_sync_group + if not isinstance(replica_id, ops.Tensor): + replica_id = constant_op.constant(replica_id) + return replica_id + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py index 0db5844e4c4..e530ab6f173 100644 --- a/tensorflow/contrib/distribute/python/keras_test.py +++ b/tensorflow/contrib/distribute/python/keras_test.py @@ -24,9 +24,10 @@ import numpy as np from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import mirrored_strategy from tensorflow.contrib.distribute.python import tpu_strategy -from tensorflow.contrib.distribute.python import values from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import values +from tensorflow.python.eager import test from tensorflow.python.estimator import keras as keras_lib from tensorflow.python.estimator import run_config as run_config_lib from tensorflow.python.framework import constant_op @@ -35,14 +36,13 @@ from tensorflow.python.framework import random_seed from tensorflow.python.framework import test_util from tensorflow.python.keras import testing_utils from tensorflow.python.keras.engine import distributed_training_utils +from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras from tensorflow.python.ops.parsing_ops import gen_parsing_ops from tensorflow.python.platform import gfile -from tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache from tensorflow.python.training import gradient_descent from tensorflow.python.training import rmsprop - _RANDOM_SEED = 1337 _TRAIN_SIZE = 200 _INPUT_SIZE = (10,) @@ -212,13 +212,18 @@ def multi_input_output_model(): return model -def get_correctness_test_inputs(use_numpy, with_distribution, +def get_correctness_test_inputs(use_numpy, use_validation_data, + with_distribution, x_train, y_train, x_predict): """Generates the inputs for correctness check when enable Keras with DS.""" global_batch_size = 64 batch_size = global_batch_size # TODO(b/118776054): Use global batch size for Keras/DS support. 
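The hunk that follows changes the correctness-test inputs from always dividing the batch across replicas to doing so only for strategies without global-batch-size support. A standalone sketch of that scaling rule, with the support flag modelled as a plain boolean argument purely for illustration (the real code calls `distributed_training_utils.global_batch_size_supported(strategy)`):

```
def scaled_batch_size(global_batch_size, num_replicas_in_sync,
                      global_batch_size_supported):
  # Illustrative stand-in for the per-core scaling applied below.
  if global_batch_size_supported:
    return global_batch_size
  return global_batch_size // num_replicas_in_sync

assert scaled_batch_size(64, 2, True) == 64   # strategy consumes the global batch
assert scaled_batch_size(64, 2, False) == 32  # per-core strategy: 32 per replica
```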
- if with_distribution: + use_per_core_batch_size = ( + with_distribution and + not distributed_training_utils.global_batch_size_supported( + with_distribution)) + if use_per_core_batch_size: batch_size //= with_distribution.num_replicas_in_sync if use_numpy: @@ -229,16 +234,17 @@ def get_correctness_test_inputs(use_numpy, with_distribution, 'epochs': 1, 'shuffle': False, } - eval_inputs = { - 'batch_size': batch_size, - 'x': x_train, - 'y': y_train, - } + + if use_validation_data: + eval_inputs = None + training_inputs['validation_data'] = (x_train, y_train) + else: + eval_inputs = { + 'batch_size': batch_size, + 'x': x_train, + 'y': y_train, + } predict_inputs = { - # TODO(b/119318587): We should not require batch_size when distribution - # is enabled. - 'batch_size': (len(x_predict) // with_distribution.num_replicas_in_sync - if with_distribution else None), 'x': np.array(x_predict, dtype=np.float32), } else: @@ -256,20 +262,28 @@ def get_correctness_test_inputs(use_numpy, with_distribution, 'shuffle': False, 'steps_per_epoch': len(x_train) // global_batch_size, } - eval_inputs = { - 'batch_size': None, - 'x': x, - 'y': None, - 'steps': 20, - } + if use_validation_data: + eval_inputs = None # Remove the eval_inputs + eval_dataset = dataset_ops.Dataset.from_tensor_slices( + (x_train, y_train)) + x = batch_wrapper(eval_dataset, batch_size, with_distribution) + training_inputs['validation_data'] = x + training_inputs['validation_steps'] = 5 + else: + eval_inputs = { + 'batch_size': None, + 'x': x, + 'y': None, + 'steps': 20, + } + predict_batch_size = len(x_predict) - if with_distribution: + if use_per_core_batch_size: predict_batch_size //= with_distribution.num_replicas_in_sync predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict) predict_dataset = batch_wrapper(predict_dataset, predict_batch_size, with_distribution) predict_inputs = { - 'batch_size': None, 'steps': 1, 'x': predict_dataset, } @@ -277,47 +291,71 @@ def get_correctness_test_inputs(use_numpy, with_distribution, return training_inputs, eval_inputs, predict_inputs -strategies = [combinations.default_strategy, - combinations.one_device_strategy, - combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus, - combinations.tpu_strategy, # steps_per_run=2 - combinations.tpu_strategy_one_step] +strategies_minus_tpu = [ + combinations.default_strategy, + combinations.one_device_strategy, + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus] + +tpu_strategies = [ + combinations.tpu_strategy, # steps_per_run=2 + combinations.tpu_strategy_one_step] def strategy_minus_tpu_combinations(): return combinations.combine( - distribution=[combinations.default_strategy, - combinations.one_device_strategy, - combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus], - mode=['graph']) + distribution=strategies_minus_tpu, + mode=['graph', 'eager']) -def strategy_combinations(): +def tpu_strategy_combinations(): return combinations.combine( - distribution=strategies, + distribution=tpu_strategies, mode=['graph']) +def all_strategy_combinations(): + return strategy_minus_tpu_combinations() + tpu_strategy_combinations() + + +# TODO(priyag): Add v2 optimizers here. 
def strategy_and_optimizer_combinations(): + return combinations.times( + all_strategy_combinations(), + combinations.combine( + optimizer=[combinations.adagrad_optimizer_v1_fn, + combinations.adam_optimizer_v1_fn, + combinations.gradient_descent_optimizer_v1_fn, + combinations.rmsprop_optimizer_v1_fn])) + + +def strategy_and_input_combinations(): + return ( + combinations.times( + combinations.combine(distribution=strategies_minus_tpu), + combinations.combine(mode=['graph'], + use_numpy=[True, False], + use_validation_data=[True, False]) + + combinations.combine(mode=['eager'], + use_numpy=[False], + use_validation_data=[False])) + + combinations.times( + combinations.combine(distribution=tpu_strategies), + combinations.combine(mode=['graph'], + use_numpy=[True, False], + use_validation_data=[True, False]))) + + +def strategy_for_numpy_input_combinations(): return combinations.combine( - distribution=strategies, - optimizer=[combinations.adagrad_optimizer_v1_fn, - combinations.adam_optimizer_v1_fn, - combinations.gradient_descent_optimizer_v1_fn, - combinations.rmsprop_optimizer_v1_fn], + distribution=strategies_minus_tpu + tpu_strategies, mode=['graph']) -def strategy_and_inputs(): - return combinations.combine( - distribution=strategies, - use_numpy=[True, False], - mode=['graph']) - - -class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): +class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase, + parameterized.TestCase): def setUp(self): self._base_dir = os.path.join(self.get_temp_dir(), @@ -325,17 +363,18 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): gfile.MakeDirs(self._base_dir) self._config = run_config_lib.RunConfig( tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir) - self._dist = mirrored_strategy.MirroredStrategy( - devices=['/device:GPU:0', '/device:GPU:1']) def tearDown(self): writer_cache.FileWriterCache.clear() if os.path.isdir(self._base_dir): gfile.DeleteRecursively(self._base_dir) - def test_train_functional_with_distribution_strategy(self): - dist = mirrored_strategy.MirroredStrategy( - devices=['/device:GPU:0', '/device:GPU:1']) + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_train_functional_with_distribution_strategy(self, distribution): keras_model = simple_functional_model() keras_model.compile( loss='categorical_crossentropy', @@ -343,8 +382,8 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01)) config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir, - train_distribute=dist, - eval_distribute=dist) + train_distribute=distribution, + eval_distribute=distribution) with self.cached_session(): est_keras = keras_lib.model_to_estimator( keras_model=keras_model, config=config) @@ -358,9 +397,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): writer_cache.FileWriterCache.clear() gfile.DeleteRecursively(self._config.model_dir) - def test_train_sequential_with_distribution_strategy(self): - dist = mirrored_strategy.MirroredStrategy( - devices=['/device:GPU:0', '/device:GPU:1']) + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_train_sequential_with_distribution_strategy(self, distribution): 
keras_model = simple_sequential_model() keras_model.compile( loss='categorical_crossentropy', @@ -368,7 +410,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01)) config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir, - train_distribute=dist) + train_distribute=distribution) with self.cached_session(): est_keras = keras_lib.model_to_estimator( keras_model=keras_model, config=config) @@ -382,7 +424,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): writer_cache.FileWriterCache.clear() gfile.DeleteRecursively(self._config.model_dir) - def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self, distribution): train_data, test_data = get_multi_inputs_multi_outputs_data() def train_input_fn(): @@ -412,14 +459,14 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): output_dict)).batch(16) self.do_test_multi_inputs_multi_outputs_with_input_fn( - train_input_fn, eval_input_fn) + distribution, train_input_fn, eval_input_fn) - def do_test_multi_inputs_multi_outputs_with_input_fn(self, train_input_fn, - eval_input_fn): + def do_test_multi_inputs_multi_outputs_with_input_fn( + self, distribution, train_input_fn, eval_input_fn): config = run_config_lib.RunConfig( tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir, - train_distribute=self._dist) + train_distribute=distribution) with self.cached_session(): model = multi_inputs_multi_outputs_model() est_keras = keras_lib.model_to_estimator(keras_model=model, config=config) @@ -429,9 +476,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1) self.assertLess(eval_results['loss'], baseline_eval_results['loss']) - def test_keras_optimizer_with_distribution_strategy(self): - dist = mirrored_strategy.MirroredStrategy( - devices=['/device:GPU:0', '/device:GPU:1']) + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_keras_optimizer_with_distribution_strategy(self, distribution): keras_model = simple_sequential_model() keras_model.compile( loss='categorical_crossentropy', @@ -439,7 +489,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir, - train_distribute=dist) + train_distribute=distribution) with self.cached_session(): est_keras = keras_lib.model_to_estimator(keras_model=keras_model, config=config) @@ -455,7 +505,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): class TestDistributionStrategyWithNumpyArrays(test.TestCase, parameterized.TestCase): - @combinations.generate(strategy_combinations()) + @combinations.generate(strategy_for_numpy_input_combinations()) def test_creating_var_with_numpy_arrays(self, distribution): with self.cached_session(): x = np.asarray(np.random.random((64, 3)), dtype=np.float32) @@ -464,84 +514,135 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, # Verify that the numpy value is copied to the variable. 
self.assertAllEqual(x, val) - def test_calculating_batch_params(self): - # This verifies that we calculate the number of steps when the batch size - # is specified. + @combinations.generate(strategy_for_numpy_input_combinations()) + def test_calculating_input_params_no_steps_no_batch_size(self, distribution): + # Calculate the per_replica_batch_size scaling factor for strategies + # that use per_core_batch_size + replica_scale_factor = 1.0 + if not distributed_training_utils.global_batch_size_supported(distribution): + replica_scale_factor = distribution.num_replicas_in_sync + with self.cached_session(): - # 64 is the number of input samples. - inputs = np.zeros((64, 3), dtype=np.float32) - # The number of replicas is equal to 3. - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0', - '/device:GPU:1']) + # Input samples of different sizes + input_20_samples = np.zeros((20, 3), dtype=np.float32) + input_63_samples = np.zeros((63, 3), dtype=np.float32) + input_64_samples = np.zeros((64, 3), dtype=np.float32) - with self.assertRaisesRegexp(ValueError, 'Please specify a batch_size ' - 'that is smaller than'): - # The batch size(128) is larger than the number of input - # samples(64). - distributed_training_utils.get_input_batch_params(inputs, - 128, - strategy) - - with self.assertRaisesRegexp(ValueError, 'is smaller than the number ' - 'of replicas'): - # The batch size(32) * num_replicas_in_sync(3) is 96 which is greater - # than the number of input samples(64). - distributed_training_utils.get_input_batch_params(inputs, - 32, - strategy) - - # The number of replicas now is equal to 2. - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0']) - # 32 is the batch size per replica. - steps = distributed_training_utils.get_input_batch_params(inputs, - 32, - strategy) - # The number of batches is the ratio of input samples(64) to - # batch size(32) which is 2. The number of steps(1) is the ratio of - # number of batches(2) to the number of replicas(2). - self.assertEqual(steps, 1) - - # 16 is the batch size per replica. - steps = distributed_training_utils.get_input_batch_params(inputs, - 16, - strategy) - # The number of batches is the ratio of input samples(64) to - # batch size(16) which is 4. The number of steps(2) is the ratio of - # number of batches(4) to the number of replicas(2). + # Default global batch size 32 for input with 64 samples run in 2 steps + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=None, batch_size=None) + self.assertEqual(batch_size, 32 // replica_scale_factor) self.assertEqual(steps, 2) - def test_calculating_batch_size(self): + # Computed global batch size 20 is lower than 32 if we pass less samples. + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_20_samples, steps=None, batch_size=None) + self.assertEqual(batch_size, 20 // replica_scale_factor) + self.assertEqual(steps, 1) + + # Default global batch size 32 cannot be used with 63 samples. 
+ with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'): + distributed_training_utils.get_input_params( + distribution, input_63_samples, steps=None, batch_size=None) + + @combinations.generate(strategy_for_numpy_input_combinations()) + def test_calculating_input_params_with_steps_no_batch_size(self, + distribution): + # Calculate the per_replica_batch_size scaling factor for strategies + # that use per_core_batch_size + replica_scale_factor = 1.0 + if not distributed_training_utils.global_batch_size_supported(distribution): + replica_scale_factor = distribution.num_replicas_in_sync + with self.cached_session(): - # 64 is the number of input samples. - inputs = np.zeros((64, 3), dtype=np.float32) - targets = np.zeros((64, 4), dtype=np.float32) + # Input samples of different sizes + input_63_samples = np.zeros((63, 3), dtype=np.float32) + input_64_samples = np.zeros((64, 3), dtype=np.float32) - model = get_model() - optimizer = gradient_descent.GradientDescentOptimizer(0.001) - loss = 'mse' - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0']) - strategy._require_static_shapes = True + # Computed global batch size is correct when 1 step is specified + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=1, batch_size=None) + self.assertEqual(batch_size, 64 // replica_scale_factor) + self.assertEqual(steps, 1) - model.compile(optimizer, loss, distribute=strategy) - iterator = model._distribution_standardize_user_data(inputs, - targets, - batch_size=None, - check_steps=True, - steps_name='steps', - steps=3) + # Computed global batch size is correct when 2 steps are specified + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=2, batch_size=None) + self.assertEqual(batch_size, 32 // replica_scale_factor) + self.assertEqual(steps, 2) - # The global batch size(21) across all replicas is the ratio of the input - # samples(64) to the steps(3). - # The batch size(10) per device is the ratio of the global batch size(21) - # to the number of replicas(2). - # The global batch size and batch size are rounded integer values. - self.assertEqual(10, distributed_training_utils.get_batch_dimension( - iterator._iterator)) + # All samples cannot be consumed in the specified number of steps + with self.assertRaisesRegexp(ValueError, 'not divisible by steps'): + distributed_training_utils.get_input_params( + distribution, input_63_samples, steps=2, batch_size=None) - @combinations.generate(strategy_combinations()) + # This case differs between strategies because the supported batch size + # is either global or per-replica.
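The input-params tests in this hunk pin down a small contract for `get_input_params`: a default global batch capped at 32, divisibility checks between samples, steps, and batch size, and a per-replica adjustment for strategies that do not take a global batch size. A rough, framework-free model of that contract follows; it only illustrates what the assertions above and below check, and is not the actual `distributed_training_utils` implementation:

```
# Rough model of the behaviour these tests assert; the real
# get_input_params in distributed_training_utils may differ in details.
def infer_input_params(num_samples, steps=None, batch_size=None,
                       num_replicas=1, uses_global_batch_size=True):
  """Returns (steps, batch_size as the strategy expects it) or raises."""
  if steps is None and batch_size is None:
    global_batch = min(num_samples, 32)          # default cap of 32
    if num_samples % global_batch:
      raise ValueError('not divisible by batch size')
    steps = num_samples // global_batch
  elif batch_size is None:                       # only steps given
    if num_samples % steps:
      raise ValueError('not divisible by steps')
    global_batch = num_samples // steps
  else:
    # batch_size given: it is per replica for strategies that do not
    # support a global batch size.
    scale = 1 if uses_global_batch_size else num_replicas
    global_batch = batch_size * scale
    if steps is not None:                        # both given
      if num_samples < global_batch * steps:
        raise ValueError('less than samples required')
      return steps, batch_size
    if num_samples % global_batch:
      raise ValueError('not divisible by batch size')
    return num_samples // global_batch, batch_size
  if uses_global_batch_size:
    return steps, global_batch
  if global_batch % num_replicas:
    raise ValueError('could not be sharded evenly across the sync replicas')
  return steps, global_batch // num_replicas


# Matches the assertions above: 64 samples, no hints -> 2 steps of 32
# globally, or 2 steps of 16 per replica on a 2-replica per-core strategy.
assert infer_input_params(64) == (2, 32)
assert infer_input_params(64, num_replicas=2,
                          uses_global_batch_size=False) == (2, 16)
```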
+ if replica_scale_factor == 1: + # Computed global batch size is correct even if not shardable + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_63_samples, steps=3, batch_size=None) + self.assertEqual(batch_size, 21) + self.assertEqual(steps, 3) + else: + # Computed global batch size cannot be sharded across replicas + with self.assertRaisesRegexp(ValueError, 'could not be sharded evenly ' + 'across the sync replicas'): + distributed_training_utils.get_input_params( + distribution, input_63_samples, steps=1, batch_size=None) + + @combinations.generate(strategy_for_numpy_input_combinations()) + def test_calculating_input_params_no_steps_with_batch_size(self, + distribution): + # Calculate the per_replica_batch_size scaling factor for strategies + # that use per_core_batch_size + replica_scale_factor = 1.0 + if not distributed_training_utils.global_batch_size_supported(distribution): + replica_scale_factor = distribution.num_replicas_in_sync + + with self.cached_session(): + input_64_samples = np.zeros((64, 3), dtype=np.float32) + + # Computed steps is correct for specified batch size + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=None, batch_size=16) + self.assertEqual(batch_size, 16) + self.assertEqual(steps, 4 // replica_scale_factor) + + # Computed steps is correct for specified batch size + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=None, batch_size=32) + self.assertEqual(batch_size, 32) + self.assertEqual(steps, 2 // replica_scale_factor) + + # Number of samples is not divisible by the global batch size + with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'): + distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=None, batch_size=20) + + # Number of samples is not divisible by the global batch size + with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'): + distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=None, batch_size=3) + + @combinations.generate(strategy_for_numpy_input_combinations()) + def test_calculating_input_params_with_steps_with_batch_size(self, + distribution): + with self.cached_session(): + input_64_samples = np.zeros((64, 3), dtype=np.float32) + + # No change to steps and batch size if both specified and feasible + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=5, batch_size=3) + self.assertEqual(batch_size, 3) + self.assertEqual(steps, 5) + + # Number of samples is less than global batch size * steps + with self.assertRaisesRegexp(ValueError, 'less than samples required'): + distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=10, batch_size=13) + + @combinations.generate(strategy_for_numpy_input_combinations()) def test_calling_model_with_numpy_arrays(self, distribution): with self.cached_session(): model = get_model() @@ -572,7 +673,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, # with batch_size model.predict(inputs, batch_size=8) - @combinations.generate(strategy_combinations()) + @combinations.generate(strategy_for_numpy_input_combinations()) def test_calling_model_with_nested_numpy_arrays(self, distribution): with self.cached_session(): model = multi_input_output_model() @@ -606,21 +707,22 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, # with batch_size model.predict(inputs, 
batch_size=8) - @combinations.generate(strategy_minus_tpu_combinations()) + @combinations.generate(combinations.combine( + distribution=strategies_minus_tpu, mode=['graph'])) def test_numpy_with_sample_weights(self, distribution): model = get_model() optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' model.compile(optimizer, loss, distribute=distribution) - inputs = np.zeros((10, 3), np.float32) - targets = np.zeros((10, 4), np.float32) - sample_weights = np.ones((10), np.float32) + inputs = np.zeros((20, 3), np.float32) + targets = np.zeros((20, 4), np.float32) + sample_weights = np.ones((20), np.float32) model.fit(inputs, targets, sample_weight=sample_weights, epochs=1, steps_per_epoch=2, verbose=1) - @combinations.generate(strategy_combinations()) + @combinations.generate(strategy_for_numpy_input_combinations()) def test_flatten_predict_outputs(self, distribution): with self.cached_session(): model = multi_input_output_model() @@ -638,7 +740,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, # `predict` a list that is equal in length to the number of model outputs. # In this test our model has two outputs and each element of `outs` # corresponds to all the samples of one of the model outputs. - self.assertEqual(2, len(outs)) + self.assertLen(outs, 2) # Each of the output samples have a dimension of 7. We should process all # the available input samples(6). self.assertAllEqual([6, 7], outs[0].shape) @@ -648,7 +750,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, class TestDistributionStrategyWithDatasets(test.TestCase, parameterized.TestCase): - @combinations.generate(strategy_combinations()) + @combinations.generate(all_strategy_combinations()) def test_calling_model_on_same_dataset(self, distribution): with self.cached_session(): model = get_model() @@ -667,7 +769,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase, validation_data=dataset, validation_steps=2) model.predict(get_predict_dataset(distribution), steps=2) - @combinations.generate(strategy_combinations()) + @combinations.generate(all_strategy_combinations()) def test_model_interleaved_eval_same_as_direct_eval(self, distribution): with self.cached_session(): user_controlled_model = get_model() @@ -710,16 +812,20 @@ class TestDistributionStrategyWithDatasets(test.TestCase, # TODO(priyag): Enable this test for TPU. Currently tuples/dict don't work # as clone_model's input_tensors argument only seems to accept list and not # tuples or dict. 
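The test that follows exercises `fit()` with tuple- and dict-structured datasets under a mirrored strategy. As a minimal standalone sketch of the dict-shaped input it relies on (the `'input_a'`/`'input_b'` and output-name keys, and the output width, are assumptions for illustration, not copied from the model in this file):

```
# Standalone sketch: tf.data preserves nested (dict, dict) structures
# through batching, which is what lets model.fit(dataset_dict, ...)
# map features and targets to layers by name.
import numpy as np
import tensorflow as tf

features = {'input_a': np.random.random((10, 3)).astype(np.float32),
            'input_b': np.random.random((10, 5)).astype(np.float32)}
targets = {'output_1': np.random.random((10, 7)).astype(np.float32),
           'output_2': np.random.random((10, 7)).astype(np.float32)}

dataset_dict = tf.data.Dataset.from_tensor_slices((features, targets))
dataset_dict = dataset_dict.repeat().batch(4)
```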
- def test_fit_with_tuple_and_dict_dataset_inputs(self): + + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=['graph', 'eager'])) + def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution): with self.cached_session(): model = multi_input_output_model() optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001) loss = 'mse' metrics = ['mae', keras.metrics.CategoricalAccuracy()] - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0']) - model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + model.compile(optimizer, loss, metrics=metrics, distribute=distribution) input_a_np = np.random.random((10, 3)) input_b_np = np.random.random((10, 5)) @@ -743,7 +849,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase, model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1) - @combinations.generate(strategy_combinations()) + @combinations.generate(all_strategy_combinations()) def test_fit_eval_and_predict_methods_on_dataset(self, distribution): with self.cached_session(): model = get_model() @@ -792,25 +898,18 @@ class TestDistributionStrategyWithDatasets(test.TestCase, model.evaluate(dataset, steps=2, verbose=1) model.predict(dataset, steps=2) - def test_dataset_input_shape_validation(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph', 'eager'])) + def test_dataset_wrong_input_shape(self, distribution): with self.cached_session(): model = get_model() optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', - '/device:GPU:0']) - - model.compile(optimizer, loss, distribute=strategy) - - # User forgets to batch the dataset - inputs = np.zeros((10, 3), dtype=np.float32) - targets = np.zeros((10, 4), dtype=np.float32) - dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - - with self.assertRaisesRegexp(ValueError, 'expected input to have shape'): - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) + model.compile(optimizer, loss, distribute=distribution) # Wrong input shape inputs = np.zeros((10, 5), dtype=np.float32) @@ -823,6 +922,26 @@ class TestDistributionStrategyWithDatasets(test.TestCase, 'expected input to have shape'): model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) + @combinations.generate(combinations.combine( + distribution=[combinations.mirrored_strategy_with_two_gpus], + mode=['graph', 'eager'])) + def test_dataset_no_batch_input_validation(self, distribution): + with self.cached_session(): + model = get_model() + + optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + model.compile(optimizer, loss, distribute=distribution) + + # User forgets to batch the dataset + inputs = np.zeros((10, 3), dtype=np.float32) + targets = np.zeros((10, 4), dtype=np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + + with self.assertRaisesRegexp(ValueError, 'expected input to have shape'): + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) + @combinations.generate(combinations.combine( distribution=[combinations.tpu_strategy_one_step], mode=['graph'])) @@ -842,7 +961,12 @@ class 
TestDistributionStrategyWithDatasets(test.TestCase, with self.assertRaisesRegexp(ValueError, 'requires fully defined shapes'): model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) - def test_learning_phase_value(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph', 'eager'])) + def test_learning_phase_value(self, distribution): # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare # meaningful values. Currently we don't pass the learning phase if the # Lambda layer uses the learning phase. @@ -856,15 +980,17 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(0.005) loss = 'mse' metrics = ['acc'] - strategy = mirrored_strategy.MirroredStrategy( - ['/device:GPU:0', '/device:GPU:1']) + model.compile(optimizer, loss, metrics=metrics, distribute=distribution) - model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + batch_size = 8 + if isinstance(distribution, mirrored_strategy.CoreMirroredStrategy): + # CoreMirroredStrategy uses global batch size. + batch_size = 8 * distribution.num_replicas_in_sync inputs = np.ones((10, 1), dtype=np.float32) targets = np.ones((10, 1), dtype=np.float32) dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat().batch(8) + dataset = dataset.repeat().batch(batch_size) hist = model.fit(dataset, epochs=1, steps_per_epoch=20, verbose=1) self.assertAlmostEqual(hist.history['acc'][0], 0, 0) @@ -875,24 +1001,51 @@ class TestDistributionStrategyWithDatasets(test.TestCase, inputs = np.ones((10, 1), dtype=np.float32) predict_dataset = dataset_ops.Dataset.from_tensor_slices(inputs) - predict_dataset = predict_dataset.repeat().batch(5) + + predict_dataset = predict_dataset.repeat().batch(batch_size) output = model.predict(predict_dataset, steps=10) - # `predict` runs for 10 steps and in each step you process 100 samples. 
- ref_output = np.ones((100, 1), dtype=np.float32) + # `predict` runs for 10 steps + ref_output = np.ones((160, 1), dtype=np.float32) self.assertArrayNear(output, ref_output, 1e-1) + @combinations.generate(strategy_minus_tpu_combinations()) + def testOptimizerWithCallbacks(self, distribution): + with self.cached_session(): + model = get_model() + + optimizer = gradient_descent_keras.SGD(0.01) + loss = 'mse' + model.compile(optimizer, loss, distribute=distribution) + + dataset = get_dataset(distribution) + + def schedule(_): + return 0.001 + + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0, + callbacks=[keras.callbacks.LearningRateScheduler(schedule)]) + grouped_models = distribution.unwrap(model._grouped_model) + with distribution.scope(): + for m in grouped_models: + self.assertAllClose(0.001, keras.backend.get_value( + m.optimizer.lr), atol=1e-05, rtol=1e-05) + class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): - def test_validating_dataset_input_tensors_with_shape_mismatch(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=['graph', 'eager'])) + def test_validating_dataset_input_tensors_with_shape_mismatch(self, + distribution): with self.cached_session(): - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0']) a = constant_op.constant([1, 2], shape=(1, 2)) b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2)) x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b}) y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a}) - with strategy.scope(): + with distribution.scope(): # Removed device and input tensor shape details from the error message # since the order of the device and the corresponding input tensor shape # is not deterministic over different runs. @@ -901,17 +1054,21 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): 'distributed tensor inputs ' 'DistributedValues:.+'): distributed_training_utils.validate_distributed_dataset_inputs( - strategy, x, y) + distribution, x, y) - def test_validating_dataset_input_tensors_with_dtype_mismatch(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=['graph', 'eager'])) + def test_validating_dataset_input_tensors_with_dtype_mismatch(self, + distribution): with self.cached_session(): - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0']) a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32) b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64) x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b}) y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a}) - with strategy.scope(): + with distribution.scope(): # Removed device and input tensor dtype details from the error message # since the order of the device and the corresponding input tensor dtype # is not deterministic over different runs. 
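A quick arithmetic check of the `ref_output` assertion in the learning-phase test earlier in this hunk: both parameterizations run on two GPUs, so whether the strategy takes a global batch (CoreMirroredStrategy, 8 * 2 = 16) or a per-replica batch of 8 on each of two replicas (contrib MirroredStrategy), every predict step consumes 16 samples, and 10 steps produce the 160 rows the test expects.

```
# Worked numbers behind ref_output's shape of (160, 1) above.
num_replicas = 2            # the *_with_two_gpus strategy variants
per_replica_batch = 8
samples_per_step = per_replica_batch * num_replicas   # 16 either way
predict_steps = 10
assert predict_steps * samples_per_step == 160
```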
@@ -920,21 +1077,23 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): 'distributed tensor inputs ' 'DistributedValues:.+'): distributed_training_utils.validate_distributed_dataset_inputs( - strategy, x, y) + distribution, x, y) - def test_unsupported_features(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph', 'eager'])) + def test_unsupported_features(self, distribution): with self.cached_session(): model = get_model() optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae'] - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', - '/device:GPU:0']) + model.compile(optimizer, loss, metrics=metrics, distribute=distribution) - model.compile(optimizer, loss, metrics=metrics, distribute=strategy) - - dataset = get_dataset(strategy) + dataset = get_dataset(distribution) # Test with validation split with self.assertRaisesRegexp( @@ -969,30 +1128,33 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): 'you should specify the `steps` argument'): model.predict(dataset, verbose=0) - def test_calling_with_unsupported_predefined_callbacks(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph', 'eager'])) + def test_calling_with_unsupported_predefined_callbacks(self, distribution): with self.cached_session(): model = get_model() optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae'] - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', - '/device:GPU:0']) - model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + model.compile(optimizer, loss, metrics=metrics, distribute=distribution) - dataset = get_dataset(strategy) + dataset = get_dataset(distribution) def schedule(_): return 0.001 with self.assertRaisesRegexp(ValueError, - 'LearningRateScheduler callback is not ' - 'supported with DistributionStrategy.'): + 'You must specify a Keras Optimizer V2 when ' + 'using'): model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0, callbacks=[keras.callbacks.LearningRateScheduler(schedule)]) with self.assertRaisesRegexp(ValueError, - 'ReduceLROnPlateau callback is not ' - 'supported with DistributionStrategy.'): + 'You must specify a Keras Optimizer V2 when ' + 'using'): model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0, callbacks=[keras.callbacks.ReduceLROnPlateau()]) with self.assertRaisesRegexp(ValueError, @@ -1003,11 +1165,17 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): callbacks=[keras.callbacks.TensorBoard(histogram_freq=10)]) -class TestDistributionStrategyWithLossMasking(test.TestCase): +class TestDistributionStrategyWithLossMasking(test.TestCase, + parameterized.TestCase): # TODO(priyag): Enable all strategies for this test. Currently it does not # work for TPU due to some invalid datatype. 
- def test_masking(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph', 'eager'])) + def test_masking(self, distribution): with self.cached_session(): np.random.seed(1337) x = np.array([[[1], [1]], [[0], [0]]]) @@ -1016,12 +1184,9 @@ class TestDistributionStrategyWithLossMasking(test.TestCase): model.add( keras.layers.TimeDistributed( keras.layers.Dense(1, kernel_initializer='one'))) - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', - '/device:GPU:0']) - model.compile(loss='mse', optimizer=gradient_descent.GradientDescentOptimizer(0.01), - distribute=strategy) + distribute=distribution) y = np.array([[[1], [1]], [[1], [1]]]) dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) dataset = dataset.repeat(100) @@ -1033,7 +1198,7 @@ class TestDistributionStrategyWithLossMasking(test.TestCase): class TestDistributionStrategyWithNormalizationLayer( test.TestCase, parameterized.TestCase): - @combinations.generate(strategy_combinations()) + @combinations.generate(all_strategy_combinations()) def test_batchnorm_correctness(self, distribution): with self.cached_session(): model = keras.models.Sequential() @@ -1065,7 +1230,7 @@ class TestDistributionStrategyWithNormalizationLayer( class TestDistributionStrategyCorrectness(test.TestCase, parameterized.TestCase): - @combinations.generate(strategy_combinations()) + @combinations.generate(all_strategy_combinations()) def test_metric_correctness(self, distribution): with self.cached_session(): keras.backend.set_image_data_format('channels_last') @@ -1088,22 +1253,32 @@ class TestDistributionStrategyCorrectness(test.TestCase, distribute=distribution) batch_size = 64 - batch_size //= distribution.num_replicas_in_sync + if not distributed_training_utils.global_batch_size_supported( + distribution): + batch_size //= distribution.num_replicas_in_sync train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train)) train_dataset = batch_wrapper(train_dataset, batch_size, distribution) history = model.fit(x=train_dataset, epochs=1, steps_per_epoch=10) self.assertEqual(history.history['binary_accuracy'], [1.0]) - @combinations.generate(strategy_and_inputs()) - def test_correctness(self, distribution, use_numpy): + @combinations.generate(strategy_and_input_combinations()) + def test_correctness(self, distribution, use_numpy, use_validation_data): + with self.cached_session(): tolerance = 1e-5 - if isinstance(distribution, mirrored_strategy.MirroredStrategy): + if isinstance(distribution, (mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy)): # TODO(b/119257215): use the default one once the flakyness is fixed. tolerance = 1e-4 + if (use_validation_data and + not isinstance(distribution, tpu_strategy.TPUStrategy)): + # TODO(b/120435565): Enable tests with use_validation_data once the + # the underlying bug is fixed. + return + keras.backend.set_image_data_format('channels_last') np.random.seed(_RANDOM_SEED) random_seed.set_random_seed(_RANDOM_SEED) @@ -1123,49 +1298,72 @@ class TestDistributionStrategyCorrectness(test.TestCase, # This is used to initialize the model for both the distribution and # non-distribution run. In addition, we add few non-linear layers to make # it non-trivial. 
- model = keras.Sequential() - model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,))) - model.add(keras.layers.Dense(10, activation='relu')) - model.add(keras.layers.Dense(10, activation='relu')) - model.add(keras.layers.Dense(1)) - initial_weights = model.get_weights() + def _create_model(): + model = keras.Sequential() + model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,))) + model.add(keras.layers.Dense(10, activation='relu')) + model.add(keras.layers.Dense(10, activation='relu')) + model.add(keras.layers.Dense(1)) + return model - def fit_and_predict(with_distribution=None): + model = _create_model() + initial_weights = model.get_weights() + del model # avoid accidental usage. + + def fit_eval_and_predict(with_distribution=None): + model = _create_model() # We have initialized the model to the same weight for the distribution # and non-distribution run. model.set_weights(initial_weights) model.compile( loss=keras.losses.mean_squared_error, - optimizer=gradient_descent.GradientDescentOptimizer(0.5), + optimizer=gradient_descent_keras.SGD(0.5), distribute=with_distribution) training_inputs, eval_inputs, predict_inputs = ( - get_correctness_test_inputs(use_numpy, with_distribution, + get_correctness_test_inputs(use_numpy, use_validation_data, + with_distribution, x_train, y_train, x_predict)) - model.fit(**training_inputs) - eval_result = model.evaluate(**eval_inputs) + training_history = model.fit(**training_inputs).history + + if eval_inputs is not None: + eval_result = model.evaluate(**eval_inputs) + else: + # Creates a dummy identical eval_result to be compared later. + eval_result = 1.0 + weights = model.get_weights() predict_result = model.predict(**predict_inputs) - return weights, eval_result, predict_result + return weights, training_history, eval_result, predict_result - wts_with_ds, eval_with_ds, predict_with_ds = fit_and_predict( - with_distribution=distribution) - wts_without_ds, eval_without_ds, predict_without_ds = fit_and_predict( - with_distribution=None) + wts_with_ds, history_with_ds, eval_with_ds, predict_with_ds = ( + fit_eval_and_predict(with_distribution=distribution)) - # Verify that the weights, eval results, predict outputs are the same - # within some limits of tolerance. + (wts_without_ds, history_without_ds, eval_without_ds, + predict_without_ds) = fit_eval_and_predict(with_distribution=None) + + # Verify that the weights, training history, eval results, predict outputs + # are the same within some limits of tolerance. self.assertAllClose( - wts_with_ds, wts_without_ds, atol=tolerance, rtol=tolerance) - self.assertAllClose( - eval_with_ds, eval_without_ds, atol=tolerance, rtol=tolerance) - self.assertAllClose( - predict_with_ds, predict_without_ds, atol=tolerance, rtol=tolerance) + wts_with_ds, wts_without_ds, atol=tolerance, rtol=tolerance, + msg='Fail to assert weights after training.') + self.assertAllClose( + eval_with_ds, eval_without_ds, atol=tolerance, rtol=tolerance, + msg='Fail to assert eval results.') + self.assertAllClose( + predict_with_ds, predict_without_ds, atol=tolerance, rtol=tolerance, + msg='Fail to assert predict results.') -# TODO(priyag): Add a test for TPUStrategy with steps_per_run > 1. + if not (isinstance(distribution, tpu_strategy.TPUStrategy) + and distribution.extended.steps_per_run > 1): + # TODO(b/119894254): Enable this test for all cases once the underlying + # bug is fixed. 
+ self.assertAllClose( + history_with_ds, history_without_ds, atol=tolerance, rtol=tolerance, + msg='Fail to assert training history.') if __name__ == '__main__': diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py index c28ab416518..8ac659abe96 100644 --- a/tensorflow/contrib/distribute/python/metrics_v1_test.py +++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py @@ -72,14 +72,14 @@ def _regression_dataset_fn(): "predictions": [1., .75, .25, 0.]}).repeat() -# TODO(priyag): Add TPU Strategy to this once metrics aggregate correctly using -# ReplicaLocalVariables on TPUs. Submit http://cl/208914352. def all_combinations(): return combinations.combine( distribution=[combinations.default_strategy, combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus], + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus], mode=["graph"]) @@ -100,18 +100,19 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase): if isinstance(distribution, tpu_strategy.TPUStrategy): def step_fn(ctx, inputs): value, update = distribution.call_for_each_replica( - metric_fn, args=[inputs]) + metric_fn, args=inputs) ctx.set_non_tensor_output(name="value", output=value) return distribution.group(update) ctx = distribution.run_steps_on_dataset( - step_fn, iterator, iterations=distribution.steps_per_run) + step_fn, iterator, iterations=distribution.extended.steps_per_run) update = ctx.run_op value = ctx.non_tensor_outputs["value"] # In each run, we run multiple steps, and each steps consumes as many # batches as number of replicas. 
batches_per_update = ( - distribution.num_replicas_in_sync * distribution.steps_per_run) + distribution.num_replicas_in_sync * + distribution.extended.steps_per_run) else: value, update = distribution.call_for_each_replica( metric_fn, iterator.get_next()) diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py index c6562463edb..dcc9df4cda5 100644 --- a/tensorflow/contrib/distribute/python/minimize_loss_test.py +++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py @@ -25,6 +25,7 @@ from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import reduce_util from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.framework import constant_op @@ -63,7 +64,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): model_fn, dataset_fn, layer = minimize_loss_example( optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss) - def step_fn(ctx, *inputs): + def step_fn(ctx, inputs): del ctx # Unused return distribution.group( distribution.call_for_each_replica(model_fn, args=inputs)) @@ -157,7 +158,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): use_callable_loss=True, create_optimizer_inside_model_fn=True) - def step_fn(ctx, *inputs): + def step_fn(ctx, inputs): del ctx # Unused return distribution.group( distribution.call_for_each_replica(model_fn, args=inputs)) @@ -226,7 +227,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): renorm=renorm, update_ops_in_replica_mode=not update_ops_in_cross_replica_mode) - def step_fn(ctx, *inputs): + def step_fn(ctx, inputs): del ctx # Unused fetches = distribution.unwrap( distribution.call_for_each_replica(model_fn, args=inputs)) @@ -285,7 +286,9 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): distribution=[ combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus ]), combinations.combine( mode=["graph"], use_callable_loss=[True, False]) + @@ -321,10 +324,10 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): labels = dataset_ops.Dataset.from_tensors([[6.], [21.]]) return dataset_ops.Dataset.zip((features, labels)).repeat() - def step_fn(ctx, x, y): + def step_fn(ctx, inputs): del ctx # Unused return distribution.group( - distribution.call_for_each_replica(model_fn, args=(x, y))) + distribution.call_for_each_replica(model_fn, args=inputs)) iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn)) @@ -341,7 +344,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): run_step() v = all_vars[0] - self.assertTrue(all([v is vi for vi in all_vars[1:]])) + self.assertTrue(all(v is vi for vi in all_vars[1:])) weight = numpy.squeeze(self.evaluate(v)) # Our model is: # predict = x * w @@ -402,21 +405,21 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): train_op = optimizer.minimize(loss_fn) loss = loss_fn() output_context.set_last_step_output( - name="replica_loss_agg", + 
name="replica_loss_reduced", output=loss, - aggregation=variables_lib.VariableAggregation.MEAN) + reduce_op=reduce_util.ReduceOp.MEAN) output_context.set_non_tensor_output(key1, value1) return (train_op, loss) - def step_fn(output_context, *inputs): + def step_fn(output_context, inputs): (train_op, loss) = distribution.call_for_each_replica( model_fn, args=(output_context,) + inputs) output_context.set_last_step_output( - name="cross_replica_loss_agg", + name="cross_replica_loss_reduced", output=loss, - aggregation=variables_lib.VariableAggregation.MEAN) + reduce_op=reduce_util.ReduceOp.MEAN) output_context.set_last_step_output( - name="cross_replica_loss_noagg", + name="cross_replica_loss_not_reduced", output=loss) return distribution.group(train_op) @@ -424,16 +427,16 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): def run_step(): initial_loss = lambda: constant_op.constant(1e7) - # Initial values corresponding to aggregated losses are just single - # tensors. But for non aggregated losses, we need to have initial + # Initial values corresponding to reduced losses are just single + # tensors. But for non reduced losses, we need to have initial # values that are of the same structure as non reduced losses. In # MirroredStrategy, this will be a list of losses, in TPUStrategy # it will be single tensor. Using `broadcast` followed by `unwrap` # gives us the desired initial value structure. initial_loop_values = { - "replica_loss_agg": initial_loss(), - "cross_replica_loss_agg": initial_loss(), - "cross_replica_loss_noagg": + "replica_loss_reduced": initial_loss(), + "cross_replica_loss_reduced": initial_loss(), + "cross_replica_loss_not_reduced": distribution.unwrap(distribution.broadcast(initial_loss())) } ctx = distribution.run_steps_on_dataset( @@ -443,17 +446,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): self.assertEqual({key1: [value1]}, ctx.non_tensor_outputs) self._verify_loss_output( initial_loss(), - loss_output=ctx.last_step_outputs["replica_loss_agg"], - aggregated=True, distribution=distribution) + loss_output=ctx.last_step_outputs["replica_loss_reduced"], + reduced=True, distribution=distribution) self._verify_loss_output( initial_loss(), - loss_output=ctx.last_step_outputs["cross_replica_loss_agg"], - aggregated=True, distribution=distribution) + loss_output=ctx.last_step_outputs["cross_replica_loss_reduced"], + reduced=True, distribution=distribution) self._verify_loss_output( initial_loss(), - loss_output=ctx.last_step_outputs["cross_replica_loss_noagg"], - aggregated=False, distribution=distribution) - return (ctx.run_op, ctx.last_step_outputs["replica_loss_agg"]) + loss_output=ctx.last_step_outputs["cross_replica_loss_not_reduced"], + reduced=False, distribution=distribution) + return (ctx.run_op, ctx.last_step_outputs["replica_loss_reduced"]) self.evaluate(distribution.initialize()) if not context.executing_eagerly(): @@ -478,18 +481,16 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): error_is_not_increasing = all(y <= x for x, y in zip(error, error[1:])) self.assertTrue(error_is_not_increasing) - def _verify_loss_output(self, initial_loss, loss_output, aggregated, + def _verify_loss_output(self, initial_loss, loss_output, reduced, distribution): - if not aggregated: - self.assertEqual(distribution.num_replicas_in_sync, - len(distribution.unwrap(loss_output))) - loss_output = distribution.reduce( - aggregation=variables_lib.VariableAggregation.MEAN, - value=loss_output, destinations="/device:CPU:0") - - 
unwrapped_output = distribution.unwrap(loss_output) - self.assertEqual(1, len(unwrapped_output)) - loss_tensor = unwrapped_output[0] + if not reduced: + self.assertLen(distribution.unwrap(loss_output), + distribution.num_replicas_in_sync) + loss_tensor = distribution.reduce(reduce_util.ReduceOp.MEAN, loss_output) + else: + unwrapped_output = distribution.unwrap(loss_output) + self.assertLen(unwrapped_output, 1) + loss_tensor = unwrapped_output[0] self.assertEqual(initial_loss.dtype, loss_tensor.dtype) self.assertEqual(initial_loss.shape, loss_tensor.shape) diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py index 2d75024e7a0..20f1a08d426 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py @@ -12,293 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Class MirroredStrategy implementing DistributionStrategy.""" +"""Contrib version of MirroredStrategy.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import contextlib -from functools import partial -import threading +import functools -from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib -from tensorflow.contrib.distribute.python import shared_variable_creator -from tensorflow.contrib.distribute.python import values -from tensorflow.python import pywrap_tensorflow -from tensorflow.python.distribute import multi_worker_util -from tensorflow.python.eager import context -from tensorflow.python.eager import tape -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import device as tf_device -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops import variables as variables_lib -from tensorflow.python.training import coordinator -from tensorflow.python.training import device_util -from tensorflow.python.training import distribute as distribute_lib -from tensorflow.python.util import nest +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import distribute_lib +from tensorflow.python.distribute import mirrored_strategy +from tensorflow.python.distribute import values -# TODO(josh11b): Replace asserts in this file with if ...: raise ... - - -@contextlib.contextmanager -def _enter_graph(g): - if context.executing_eagerly(): - with g.as_default(), context.eager_mode(): - yield - else: - with g.as_default(): - yield - - -def _cpu_device(device): - cpu_device = tf_device.DeviceSpec.from_string(device) - cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0)) - return cpu_device.to_string() - - -class _RequestedStop(Exception): - pass - - -# _call_for_each_replica and _reduce_non_distributed_value are not members of -# MirroredStrategy so that they are generally not allowed to use anything -# specific to MirroredStrategy and thus can be shared with other distribution -# strategies. - - -# TODO(yuefengz): maybe create a common class for those who need to call this -# _call_for_each_replica. 
-def _call_for_each_replica(distribution, fn, args, kwargs): - """Run `fn` in separate threads, once per replica/worker device. - - Args: - distribution: the DistributionStrategy object. - fn: function to run (will be run once per device, each in its own thread). - args: positional arguments for `fn` - kwargs: keyword arguments for `fn`. - - Returns: - Merged return value of `fn` across all replicas. - - Raises: - RuntimeError: If fn() calls get_replica_context().merge_call() a different - number of times from the available devices. - """ - # TODO(josh11b): Add this option once we add synchronization to variable - # creation. Until then, this is pretty unsafe to use. - run_concurrently = False - if not context.executing_eagerly(): - # Needed for per-thread device, etc. contexts in graph mode. - ops.get_default_graph().switch_to_thread_local() - - coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,)) - - shared_variable_store = {} - - # TODO(isaprykin): Create these threads once instead of during every run() - # call. - threads = [] - for index, d in enumerate(distribution.worker_devices): - variable_creator_fn = shared_variable_creator.make_fn( - shared_variable_store, index) - t = MirroredStrategy._MirroredReplicaThread( # pylint: disable=protected-access - distribution, coord, d, variable_creator_fn, fn, - *values.select_device(d, args), **values.select_device(d, kwargs)) - threads.append(t) - - for t in threads: - t.start() - - # When `fn` starts `should_run` event is set on _MirroredReplicaThread - # (`MRT`) threads. The execution waits until - # `MRT.has_paused` is set, which indicates that either `fn` is - # complete or a `get_replica_context().merge_call()` is called. If `fn` is - # complete, then `MRT.done` is set to True. Otherwise, arguments - # of `get_replica_context().merge_call` from all paused threads are grouped - # and the `merge_fn` is performed. Results of the - # `get_replica_context().merge_call` are then set to `MRT.merge_result`. - # Each such `get_replica_context().merge_call` call returns the - # `MRT.merge_result` for that thread when `MRT.should_run` event - # is reset again. Execution of `fn` resumes. - - try: - with coord.stop_on_exception(): - all_done = False - while not all_done and not coord.should_stop(): - done = [] - if run_concurrently: - for t in threads: - t.should_run.set() - for t in threads: - t.has_paused.wait() - t.has_paused.clear() - if coord.should_stop(): - return None - done.append(t.done) - else: - for t in threads: - t.should_run.set() - t.has_paused.wait() - t.has_paused.clear() - if coord.should_stop(): - return None - done.append(t.done) - if coord.should_stop(): - return None - all_done = all(done) - if not all_done: - if any(done): - raise RuntimeError("Some replicas made a different number of " - "replica_context().merge_call() calls.") - # get_replica_context().merge_call() case - merge_args = values.regroup({t.device: t.merge_args for t in threads}) - merge_kwargs = values.regroup( - {t.device: t.merge_kwargs for t in threads}) - # We capture the name_scope of the MRT when we call merge_fn - # to ensure that if we have opened a name scope in the MRT, - # it will be respected when executing the merge function. We only - # capture the name_scope from the first MRT and assume it is - # the same for all other MRTs. 
- mtt_captured_name_scope = threads[0].captured_name_scope - with ops.name_scope(mtt_captured_name_scope): - merge_result = threads[0].merge_fn(distribution, *merge_args, - **merge_kwargs) - for t in threads: - t.merge_result = values.select_device(t.device, merge_result) - finally: - for t in threads: - t.should_run.set() - coord.join(threads) - - return values.regroup({t.device: t.main_result for t in threads}) - - -def _reduce_non_distributed_value(distribution, aggregation, value, - destinations): - """Reduce a non-DistributedValue `value` to `destinations`.""" - if isinstance(value, values.DistributedValues): - raise ValueError("You are passing a `DistributedValue` to " - "`_reduce_non_distributed_value`, which is not allowed.") - - # If the same value is present on all replicas then the PerReplica value will - # be a single value. We also handle the case when `value` is a single value - # and equal to 0. - if value == 0: - return 0 - # If the aggregation type is MEAN or ONLY_FIRST_REPLICA, then this - # essentially means that the same value should be on all destinations. - if aggregation in ( - variable_scope.VariableAggregation.MEAN, - variable_scope.VariableAggregation.ONLY_FIRST_REPLICA): - return value - - cross_tower_ops_lib.validate_destinations(destinations) - # We do not support an aggregation type of SUM if the value is the same across - # all replicas. We call this as part of assign functions for MirroredVariables - # and summing up identical values across replicas is not clearly defined. - if (len(distribution.worker_devices) != 1 or - not cross_tower_ops_lib.check_destinations(destinations)): - raise ValueError("A non-DistributedValues value %s cannot be reduced with " - "the given aggregation %s." % (value, aggregation)) - # TODO(anjalisridhar): Moves these methods to a device utility file? - devices = cross_tower_ops_lib.get_devices_from(destinations) - if len(devices) == 1: - with ops.device(devices[0]): - return array_ops.identity(value) - else: - value_updates = {} - for d in devices: - with ops.device(d): - value_updates[d] = array_ops.identity(value) - return values.Mirrored(value_updates) - - -def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs): # pylint: disable=g-missing-docstring - # Figure out what collections this variable should be added to. - # We'll add the MirroredVariable to those collections instead. - collections = kwargs.pop("collections", None) - if collections is None: - collections = [ops.GraphKeys.GLOBAL_VARIABLES] - kwargs["collections"] = [] - - # Get synchronization value - synchronization = kwargs.get("synchronization", - variable_scope.VariableSynchronization.ON_WRITE) - if synchronization == variable_scope.VariableSynchronization.NONE: - raise ValueError("`NONE` variable synchronization mode is not " - "supported with `Mirrored` distribution strategy. Please" - " change the `synchronization` for variable: " + - kwargs["name"]) - elif synchronization == variable_scope.VariableSynchronization.ON_READ: - # Variables that are to be synced on read are replica local. - is_replica_local = True - kwargs["trainable"] = False - elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or - synchronization == variable_scope.VariableSynchronization.AUTO): - # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`. 
- is_replica_local = False - else: - raise ValueError("Invalid variable synchronization mode: " + - synchronization + " for variable: " + kwargs["name"]) - - # Get aggregation value - aggregation = kwargs.pop("aggregation", - variable_scope.VariableAggregation.NONE) - if aggregation not in ( - variable_scope.VariableAggregation.NONE, - variable_scope.VariableAggregation.SUM, - variable_scope.VariableAggregation.MEAN, - variable_scope.VariableAggregation.ONLY_FIRST_REPLICA - ): - raise ValueError("Invalid variable aggregation mode: " + aggregation + - " for variable: " + kwargs["name"]) - - # Ignore user-specified caching device, not needed for mirrored variables. - kwargs.pop("caching_device", None) - - # TODO(josh11b,apassos): It would be better if variable initialization - # was never recorded on the tape instead of having to do this manually - # here. - with tape.stop_recording(): - index = real_mirrored_creator(devices, *args, **kwargs) - - if is_replica_local: - result = values.ReplicaLocalVariable( - index, index[devices[0]], aggregation) - else: - result = values.MirroredVariable(index, index[devices[0]], aggregation) - - # Add the wrapped variable to the requested collections. - # The handling of eager mode and the global step matches - # ResourceVariable._init_from_args(). - if not context.executing_eagerly(): - g = ops.get_default_graph() - # If "trainable" is True, next_creator() will add the member variables - # to the TRAINABLE_VARIABLES collection, so we manually remove - # them and replace with the MirroredVariable. We can't set - # "trainable" to False for next_creator() since that causes functions - # like implicit_gradients to skip those variables. - if kwargs.get("trainable", True): - collections.append(ops.GraphKeys.TRAINABLE_VARIABLES) - l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES) - for v in index.values(): - if v in l: - l.remove(v) - g.add_to_collections(collections, result) - elif ops.GraphKeys.GLOBAL_STEP in collections: - ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, result) - - return result +# pylint: disable=protected-access,invalid-name +_call_for_each_replica = mirrored_strategy._call_for_each_replica +_reduce_non_distributed_value = mirrored_strategy._reduce_non_distributed_value +_create_mirrored_variable = mirrored_strategy._create_mirrored_variable +all_local_devices = mirrored_strategy.all_local_devices +CoreMirroredStrategy = mirrored_strategy.MirroredStrategy +CoreMirroredExtended = mirrored_strategy.MirroredExtended +# pylint: enable=protected-access,invalid-name class MirroredStrategy(distribute_lib.DistributionStrategy): """Mirrors vars to distribute across multiple devices and machines. + *** contrib version *** + This strategy uses one replica per device and sync replication for its multi-GPU version. @@ -353,468 +95,66 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): cross_device_ops=None, auto_shard_dataset=False, cross_tower_ops=None): - super(MirroredStrategy, self).__init__() - assert not (cross_device_ops and cross_tower_ops) - self._cross_tower_ops = cross_device_ops or cross_tower_ops - self._auto_shard_dataset = auto_shard_dataset - # Remember num GPUs which might be needed by `configure` method. 
if num_gpus is not None and num_gpus_per_worker is not None: raise ValueError( "You cannot specify both `num_gpus` and `num_gpus_per_worker`.") - if num_gpus is not None: - self._num_gpus = num_gpus - else: - self._num_gpus = num_gpus_per_worker - - self._initialize_local(self._num_gpus, devices) - - def _initialize_local(self, num_gpus, devices): - """Initializes the object for local training.""" - self._cluster_spec = None - # Convert `num_gpus` into `devices`, shouldn't specify both. - if devices is None: - if num_gpus is None: - num_gpus = context.num_gpus() - if num_gpus == 0: - devices = ["/device:CPU:0"] - else: - devices = ["/device:GPU:%d" % d for d in range(num_gpus)] - elif num_gpus is not None: - raise ValueError("Must only specify one of `devices` and `num_gpus`.") - self._num_gpus = num_gpus - # TODO(yuefengz): consider setting the default device. - - assert devices, "Must specify at least one device." - assert len(set(devices)) == len(devices), ( - "No duplicates allowed in `devices` argument.") - # TODO(josh11b): Require at least 2 devices? - self._devices = [device_util.resolve(d) for d in devices] - self._canonical_device_set = set(self._devices) - self._device_index = values.PerReplica( - {d: i for i, d in enumerate(devices)}) - - def _initialize_multi_worker(self, num_gpus, cluster_spec): - """Initializes the object for multi-worker training.""" - cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec) - self._cluster_spec = cluster_spec - - self._workers = [] - for job in ["chief", "worker"]: - for task in range(len(cluster_spec.as_dict().get(job, []))): - self._workers.append("/job:%s/task:%d" % (job, task)) - if num_gpus is None: - raise ValueError("`num_gpus` is required if `cluster_spec` is given.") - if num_gpus > 0: - self._worker_devices = [ - (worker, [ - device_util.canonicalize(worker + "/device:GPU:%d" % gpu) - for gpu in range(num_gpus) - ]) for worker in self._workers - ] + num_gpus = num_gpus_per_worker + extended = MirroredExtended(self, devices, num_gpus, + cross_device_ops or cross_tower_ops, + auto_shard_dataset) + super(MirroredStrategy, self).__init__(extended) + + +class MirroredExtended(CoreMirroredExtended): + """Implementation of (contrib) MirroredStrategy.""" + + def __init__(self, + container_strategy, + devices=None, + num_gpus_per_worker=None, + cross_device_ops=None, + auto_shard_dataset=False): + if devices is None: + devices = mirrored_strategy.all_local_devices(num_gpus_per_worker) + elif num_gpus_per_worker is not None: + raise ValueError( + "Must only specify one of `devices` and `num_gpus_per_worker`.") + super(MirroredExtended, self).__init__(container_strategy, devices, + cross_device_ops) + self._auto_shard_dataset = auto_shard_dataset + + def _make_dataset_iterator(self, dataset): + """Make iterator from dataset without splitting the batch. + + This implementation is different than the one in + `tf.distribute.MirroredStrategy` for purposes of backward compatibility. + We treat the incoming dataset's batch size as per replica batch size. + + Args: + dataset: `tf.data.Dataset` for input. + Returns: + An `InputIterator` which returns inputs for each step of the computation. 
+ """ + if self._local_mode: + worker = device_util.canonicalize("/device:CPU:0") + worker_device_pairs = [(worker, self._devices)] else: - self._worker_devices = [ - (worker, [device_util.canonicalize(worker, "/device:CPU:0")]) - for worker in self._workers - ] + worker_device_pairs = self._worker_devices + return values.DatasetIterator(dataset, worker_device_pairs) - devices = nest.flatten([l for _, l in self._worker_devices]) - - # Setting `_default_device` will add a device scope in the - # distribution.scope. We set the default device to the first worker. When - # users specify device under distribution.scope by - # with tf.device("/cpu:0"): - # ... - # their ops will end up on the cpu device of its first worker, e.g. - # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode. - self._default_device = self._workers[0] - - assert devices, "Must specify at least one device." - assert len(set(devices)) == len(devices), ( - "No duplicates allowed in `devices` argument.") - # TODO(josh11b): Require at least 2 devices? - self._devices = [device_util.resolve(d) for d in devices] - self._canonical_device_set = set(self._devices) - self._device_index = values.PerReplica( - {d: i for i, d in enumerate(devices)}) - - def _create_variable(self, next_creator, *args, **kwargs): - """Create a mirrored variable. See `DistributionStrategy.scope`.""" - colocate_with = kwargs.pop("colocate_with", None) - devices = self._get_devices_from(colocate_with) - - def _real_mirrored_creator(devices, *args, **kwargs): # pylint: disable=g-missing-docstring - index = {} - for i, d in enumerate(devices): - with ops.device(d): - if i > 0: - # Give replicas meaningful distinct names: - var0name = index[devices[0]].name.split(":")[0] - # We append a / to variable names created on replicas with id > 0 to - # ensure that we ignore the name scope and instead use the given - # name as the absolute name of the variable. - kwargs["name"] = "%s/replica_%d/" % (var0name, i) - # Initialize replicas with the same value: - def initial_value_fn(device=d): - if context.executing_eagerly(): - init_value = index[devices[0]].value() - return array_ops.identity(init_value) - else: - with ops.device(device): - init_value = index[devices[0]].initial_value - return array_ops.identity(init_value) - kwargs["initial_value"] = initial_value_fn - with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT): - # Don't record operations (e.g. other variable reads) during - # variable creation. - with tape.stop_recording(): - v = next_creator(*args, **kwargs) - assert not isinstance(v, values.DistributedVariable) - index[d] = v - return index - - return _create_mirrored_variable(devices, _real_mirrored_creator, *args, - **kwargs) - - def distribute_dataset(self, dataset_fn): - if self._cluster_spec: - return values.MultiWorkerDataset( - partial(self._call_dataset_fn, dataset_fn), self._worker_devices, - auto_shard=self._auto_shard_dataset) - else: + def _distribute_dataset(self, dataset_fn): + if self._local_mode: return values.PerReplicaDataset( self._call_dataset_fn(dataset_fn), self._devices) - - # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed. 
- def _run_steps_on_dataset(self, fn, iterator, iterations, - initial_loop_values=None): - if initial_loop_values is None: - initial_loop_values = {} - initial_loop_values = nest.flatten(initial_loop_values) - - ctx = values.MultiStepContext() - def body(i, *args): - """A wrapper around `fn` to create the while loop body.""" - del args - fn_inputs = iterator.get_next() - if not isinstance(fn_inputs, tuple): - fn_inputs = (fn_inputs,) - fn_result = fn(ctx, *fn_inputs) - for (name, output) in ctx.last_step_outputs.items(): - # Convert all outputs to tensors, potentially from `DistributedValues`. - ctx.last_step_outputs[name] = self.unwrap(output) - flat_last_step_outputs = nest.flatten(ctx.last_step_outputs) - with ops.control_dependencies([fn_result]): - return [i + 1] + flat_last_step_outputs - - # We capture the control_flow_context at this point, before we run `fn` - # inside a while_loop. This is useful in cases where we might need to exit - # these contexts and get back to the outer context to do some things, for - # e.g. create an op which should be evaluated only once at the end of the - # loop on the host. One such usage is in creating metrics' value op. - self._outer_control_flow_context = ( - ops.get_default_graph()._get_control_flow_context()) # pylint: disable=protected-access - - cond = lambda i, *args: i < iterations - i = constant_op.constant(0) - loop_result = control_flow_ops.while_loop( - cond, body, [i] + initial_loop_values, name="", - parallel_iterations=1, back_prop=False, swap_memory=False, - return_same_structure=True) - del self._outer_control_flow_context - - ctx.run_op = control_flow_ops.group(loop_result) - - # Convert the last_step_outputs from a list to the original dict structure - # of last_step_outputs. - last_step_tensor_outputs = loop_result[1:] - last_step_tensor_outputs_dict = nest.pack_sequence_as( - ctx.last_step_outputs, last_step_tensor_outputs) - - for (name, aggregation) in ctx._last_step_outputs_aggregations.items(): # pylint: disable=protected-access - output = last_step_tensor_outputs_dict[name] - # For outputs that have already been aggregated, wrap them in a Mirrored - # container, else in a PerReplica container. - if aggregation is variables_lib.VariableAggregation.NONE: - last_step_tensor_outputs_dict[name] = values.regroup( - {d: t for d, t in zip(self._devices, output)}, values.PerReplica) - else: - assert len(output) == 1 - last_step_tensor_outputs_dict[name] = output[0] - - ctx._set_last_step_outputs(last_step_tensor_outputs_dict) # pylint: disable=protected-access - return ctx - - def _broadcast(self, tensor, destinations): - # TODO(josh11b): In eager mode, use one thread per device, or async mode. - return self._get_cross_tower_ops().broadcast(tensor, destinations or - self._devices) - - def _call_for_each_replica(self, fn, args, kwargs): - return _call_for_each_replica(self, fn, args, kwargs) - - def configure(self, - session_config=None, - cluster_spec=None, - task_type=None, - task_id=None): - del task_type, task_id - - if session_config: - session_config.isolate_session_state = True - - if cluster_spec: - self._initialize_multi_worker(self._num_gpus, cluster_spec) - - if self._cross_tower_ops is None: - if self._cluster_spec: - # It currently cannot detect the toplogy of remote workers. So we - # hard-code the multi-worker all-reduce algorithm for now. - if len(self._workers) == 1: - # The default is "nccl". 
- self._cross_tower_ops = cross_tower_ops_lib.AllReduceCrossDeviceOps() - else: - # The default is hierarchical reduce and broadcast. - self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce( - self._workers, self._num_gpus) - else: - self._cross_tower_ops = cross_tower_ops_lib.choose_the_best( - self._devices, session_config=session_config) - - def _get_cross_tower_ops(self): - if self._cross_tower_ops is None: - self._cross_tower_ops = ( - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()) - return self._cross_tower_ops - - def _reduce(self, aggregation, value, destinations): - assert not isinstance(value, values.Mirrored) - if not isinstance(value, values.DistributedValues): - # This function handles reducing values that are not PerReplica or - # Mirrored values. For example, the same value could be present on all - # replicas in which case `value` would be a single value or value could - # be 0. - return _reduce_non_distributed_value(self, aggregation, value, - destinations) - if aggregation == variable_scope.VariableAggregation.ONLY_FIRST_REPLICA: - value = value.get(self._devices[0]) - if isinstance(value, (int, float)): - return value - return self.broadcast(value, destinations) - return self._get_cross_tower_ops().reduce( - aggregation, value, destinations=destinations) - - def _batch_reduce(self, aggregation, value_destination_pairs): - if aggregation == variable_scope.VariableAggregation.ONLY_FIRST_REPLICA: - return [self.broadcast(v.get(self._devices[0]), d) - for v, d in value_destination_pairs] - return self._get_cross_tower_ops().batch_reduce(aggregation, - value_destination_pairs) - - def _update(self, var, options, fn, *args, **kwargs): - # TODO(josh11b): In eager mode, use one thread per device. - assert isinstance(var, values.DistributedVariable) - should_group = options.pop("grouped") - assert not options # Validate that we are processing all of the options. - updates = {} - for d, v in var._index.items(): # pylint: disable=protected-access - name = "update_%d" % self._device_index.get(d) - with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name): - # If args and kwargs are not mirrored, the value is returned as is. - updates[d] = fn(v, - *values.select_device_mirrored(d, args), - **values.select_device_mirrored(d, kwargs)) - return values.update_regroup(self, updates, should_group) - - def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs): - assert isinstance(colocate_with, list) - should_group = options.pop("grouped") - assert not options # Validate that we are processing all of the options. - # TODO(josh11b): In eager mode, use one thread per device. - updates = {} - for d in colocate_with: - name = "update_%d" % self._device_index.get(d) - with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name): - updates[d] = fn(*values.select_device_mirrored(d, args), - **values.select_device_mirrored(d, kwargs)) - return values.update_regroup(self, updates, should_group) - - def read_var(self, replica_local_var): - """Read the aggregate value of a replica-local variable.""" - if isinstance(replica_local_var, values.ReplicaLocalVariable): - return replica_local_var._get_cross_replica() # pylint: disable=protected-access - assert isinstance(replica_local_var, values.Mirrored) - return array_ops.identity(replica_local_var.get()) - - def _unwrap(self, val): - if isinstance(val, values.DistributedValues): - # Return in a deterministic order. 
- if set(val.devices) == self._canonical_device_set: - return [val.get(device=d) for d in self._devices] - return [val.get(device=d) for d in sorted(val.devices)] - return [val] - - def value_container(self, val): - return values.value_container(val) - - @property - def num_replicas(self): - return len(self._devices) - - @property - def num_replicas_in_sync(self): - return len(self._devices) - - def _worker_device_index(self): - return self._device_index - - @property - def worker_devices(self): - # Make a copy to prevent users from accidentally mutating our copy. - return list(self._devices) - - @property - def parameter_devices(self): - return list(self._devices) - - @property - def between_graph(self): - return False - - @property - def should_init(self): - return True - - @property - def should_checkpoint(self): - return True - - @property - def should_save_summary(self): - return True - - def non_slot_devices(self, var_list): - del var_list - return list(self._devices) - - def _get_devices_from(self, colocate_with=None): - if colocate_with is None: - return self._devices else: - return cross_tower_ops_lib.get_devices_from(colocate_with) - - class _MirroredReplicaThread(threading.Thread): - """A thread that runs() a function on a device.""" - - def __init__(self, dist, coord, device, variable_creator_fn, fn, *args, - **kwargs): - super(MirroredStrategy._MirroredReplicaThread, self).__init__() # pylint: disable=protected-access - self.coord = coord - self.distribution = dist - self.device = device - self.replica_id = dist.worker_devices.index(device) - self.variable_creator_fn = variable_creator_fn - # State needed to run and return the results of `fn`. - self.main_fn = fn - self.main_args = args - self.main_kwargs = kwargs - self.main_result = None - self.done = False - # State needed to run the next merge_call() (if any) requested via - # ReplicaContext. - self.merge_fn = None - self.merge_args = None - self.merge_kwargs = None - self.merge_result = None - self.captured_name_scope = None - # We use a thread.Event for the main thread to signal when this - # thread should start running (`should_run`), and another for - # this thread to transfer control back to the main thread - # (`has_paused`, either when it gets to a - # `get_replica_context().merge_call` or when `fn` returns). In - # either case the event starts cleared, is signaled by calling - # set(). The receiving thread waits for the signal by calling - # wait() and then immediately clearing the event using clear(). - self.should_run = threading.Event() - self.has_paused = threading.Event() - # These fields have to do with inheriting various contexts from the - # parent thread: - # pylint: disable=protected-access - self.context_mode = context.context()._eager_context.mode - if not context.context()._context_handle: - context.context()._initialize_handle_and_devices() - self.context_device_policy = ( - pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy( - context.context()._context_handle)) - self.graph = ops.get_default_graph() - self._variable_creator_stack = self.graph._variable_creator_stack[:] - self._captured_var_scope = variable_scope.get_variable_scope() - # Adding a "/" at end lets us re-enter this scope later. 
- self._name_scope = self.graph.get_name_scope() - if self._name_scope: - self._name_scope += "/" - if self.replica_id > 0: - if not self._name_scope: - self._name_scope = "" - self._name_scope += "replica_%d/" % self.replica_id - - def run(self): - # pylint: disable=protected-access - self.graph._variable_creator_stack = self._variable_creator_stack - self.should_run.wait() - self.should_run.clear() - try: - if self.coord.should_stop(): - return - with self.coord.stop_on_exception(), \ - context.context()._mode(self.context_mode), \ - context.context().device_policy(self.context_device_policy), \ - _enter_graph(self.graph), \ - MirroredReplicaContext(self.distribution, self.replica_id), \ - ops.device(self.device), \ - ops.name_scope(self._name_scope), \ - variable_scope.variable_scope( - self._captured_var_scope, reuse=self.replica_id > 0), \ - variable_scope.variable_creator_scope(self.variable_creator_fn): - self.main_result = self.main_fn(*self.main_args, **self.main_kwargs) - self.done = True - finally: - self.has_paused.set() - - -class MirroredReplicaContext(distribute_lib.ReplicaContext): - """ReplicaContext used in MirroredStrategy.call_for_each_replica(). - - Opened in `_MirroredReplicaThread`, to allow the user to invoke - `MirroredStrategy`'s specific implementation of `merge_call()`, - which works by delegating the function and its arguments to - the main thread (the one that invoked - `MirroredStrategy.call_for_each_replica()`). - """ - - def _merge_call(self, fn, args, kwargs): - """Delegate to the main thread to actually perform merge_call().""" - t = threading.current_thread() # a _MirroredReplicaThread - t.merge_fn = fn - t.merge_args = args - t.merge_kwargs = kwargs - t.captured_name_scope = t.graph.get_name_scope() - # Adding a "/" at end lets us re-enter this scope later. - if t.captured_name_scope: - t.captured_name_scope += "/" - t.has_paused.set() - t.should_run.wait() - t.should_run.clear() - if t.coord.should_stop(): - raise _RequestedStop() - return t.merge_result + return values.MultiWorkerDataset( + functools.partial(self._call_dataset_fn, dataset_fn), + self._worker_devices, + auto_shard=self._auto_shard_dataset) + # TODO(priyag): Delete this once all strategies use global batch size. 
@property - def device(self): - raise RuntimeError("Use .devices instead") - - @property - def devices(self): - distribute_lib.require_replica_context(self) - return [self._distribution_strategy.worker_devices[self._replica_id]] + def _global_batch_size(self): + return False diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py index 1fd18e09c01..66512f983e1 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py @@ -20,22 +20,27 @@ from __future__ import print_function import sys +from absl.testing import parameterized import numpy as np +from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import mirrored_strategy from tensorflow.contrib.distribute.python import multi_worker_test_base from tensorflow.contrib.distribute.python import strategy_test_lib -from tensorflow.contrib.distribute.python import values from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import distribution_strategy_context as ds_context +from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op +from tensorflow.python.framework import func_graph from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.keras.engine import training as keras_training from tensorflow.python.keras.layers import core as keras_core from tensorflow.python.layers import core @@ -46,8 +51,6 @@ from tensorflow.python.ops import rnn_cell_impl from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables -from tensorflow.python.training import device_util -from tensorflow.python.training import distribution_strategy_context from tensorflow.python.training import gradient_descent from tensorflow.python.training import optimizer as optimizer_lib from tensorflow.python.training import server_lib @@ -56,248 +59,229 @@ from tensorflow.python.training import server_lib GPU_TEST = "test_gpu" in sys.argv[0] -class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase): +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus], + mode=["graph", "eager"])) +class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase, + parameterized.TestCase): - def _get_distribution_strategy(self): - devices = ["/device:CPU:0", "/device:GPU:0"] - if GPU_TEST: - self.assertGreater(context.num_gpus(), 0) - if context.num_gpus() > 1: - devices = ["/device:GPU:0", "/device:GPU:1"] - print(self.id().split(".")[-1], "devices:", ", ".join(devices)) - return mirrored_strategy.MirroredStrategy(devices) + def testMinimizeLoss(self, distribution): + if context.executing_eagerly(): + 
self._test_minimize_loss_eager(distribution) + else: + self._test_minimize_loss_graph(distribution) - def testMinimizeLossEager(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self._test_minimize_loss_eager(self._get_distribution_strategy()) + def testReplicaId(self, distribution): + self._test_replica_id(distribution) - def testMinimizeLossGraph(self): - soft_placement = not GPU_TEST - print("testMinimizeLossGraph soft_placement:", soft_placement) - self._test_minimize_loss_graph( - self._get_distribution_strategy(), soft_placement=soft_placement) + def testNumReplicasInSync(self, distribution): + self.assertEqual(2, distribution.num_replicas_in_sync) - def testDeviceIndex(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self._test_device_index(self._get_distribution_strategy()) + def testCallAndMergeExceptions(self, distribution): + self._test_call_and_merge_exceptions(distribution) - def testReplicaId(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self._test_replica_id(self._get_distribution_strategy()) - - def testNumReplicas(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self.assertEqual(2, self._get_distribution_strategy().num_replicas) - - def testNumReplicasInSync(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self.assertEqual(2, self._get_distribution_strategy(). - num_replicas_in_sync) - - @test_util.run_in_graph_and_eager_modes - def testCallAndMergeExceptions(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self._test_call_and_merge_exceptions(self._get_distribution_strategy()) - - @test_util.run_in_graph_and_eager_modes - def testRunRegroupError(self): - - def run_fn(device_id): + def testRunRegroupError(self, distribution): + def run_fn(): + replica_id = int(self.evaluate(_replica_id())) # Generates a list with different lengths on different devices. # Will fail in _regroup() (if more than one device). 
- return list(range(device_id)) + return list(range(replica_id)) - dist = self._get_distribution_strategy() - with dist.scope(), self.assertRaises(AssertionError): - dist.call_for_each_replica(run_fn, args=(dist.worker_device_index,)) + with distribution.scope(), self.assertRaises(AssertionError): + distribution.extended.call_for_each_replica(run_fn) - @test_util.run_in_graph_and_eager_modes - def testReduceToCpu(self): - if not GPU_TEST: - self.skipTest("Not GPU test") + def testReduceToCpu(self, distribution): + with distribution.scope(): + result = distribution.extended.call_for_each_replica(_replica_id) + reduced = distribution.reduce(reduce_util.ReduceOp.SUM, result) + expected = sum(range(distribution.num_replicas_in_sync)) + self.assertEqual(expected, self.evaluate(reduced)) - def run_fn(device_id): - return device_id + def testMakeInputFnIterator(self, distribution): + dataset_fn = lambda: dataset_ops.Dataset.range(10) + expected_values = [[i, i+1] for i in range(0, 10, 2)] - dist = self._get_distribution_strategy() - with dist.scope(): - result = dist.call_for_each_replica( - run_fn, args=(dist.worker_device_index,)) - reduced = dist.reduce( - variable_scope.VariableAggregation.SUM, - result, - destinations="/device:CPU:0") - unwrapped = dist.unwrap(reduced) - self.assertEqual(1, len(unwrapped)) - expected = sum(range(len(dist.worker_devices))) - self.assertEqual(expected, self.evaluate(unwrapped[0])) + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=2, + expected_num_input_pipelines=1, + expected_input_pipeline_id=0) + iterator = distribution.make_input_fn_iterator(input_fn) + self._test_input_fn_iterator(iterator, distribution.extended.worker_devices, + expected_values) - @test_util.run_in_graph_and_eager_modes - def testReduceOnlyFirstReplicaUpdates(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - - def run_fn(device_id): - return constant_op.constant(3 + 5 * device_id) - - dist = self._get_distribution_strategy() - with dist.scope(): - result = dist.call_for_each_replica( - run_fn, args=(dist.worker_device_index,)) - reduced = dist.reduce( - variable_scope.VariableAggregation.ONLY_FIRST_REPLICA, - result, - destinations="/device:CPU:0") - unwrapped = dist.unwrap(reduced) - self.assertEqual(1, len(unwrapped)) - self.assertEqual(3, self.evaluate(unwrapped[0])) - - @test_util.run_in_graph_and_eager_modes() - def testReduceToMultipleDestinations(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - - devices = ["/device:GPU:0"] - if GPU_TEST: - self.assertGreater(context.num_gpus(), 0) - print(self.id().split(".")[-1], "devices:", ", ".join(devices)) - - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): - reduced = dist.reduce( - variable_scope.VariableAggregation.SUM, - 1.0, - destinations=["/device:CPU:0", "/device:GPU:0"]) - unwrapped = dist.unwrap(reduced) - self.assertEqual(2, len(unwrapped)) - self.assertEqual(1.0, self.evaluate(unwrapped[0])) + def testGlobalStepUpdate(self, distribution): + self._test_global_step_update(distribution) +def one_device_combinations(): + return combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_one_cpu, + combinations.mirrored_strategy_with_one_gpu, + combinations.core_mirrored_strategy_with_one_cpu, + combinations.core_mirrored_strategy_with_one_gpu], + mode=["graph", "eager"]) + + +class MirroredOneDeviceDistributionTest( + strategy_test_lib.DistributionTestBase, + parameterized.TestCase): + + 
@combinations.generate(one_device_combinations()) + def testMinimizeLoss(self, distribution): + if context.executing_eagerly(): + self._test_minimize_loss_eager(distribution) + else: + self._test_minimize_loss_graph(distribution) + + @combinations.generate(one_device_combinations()) + def testReplicaId(self, distribution): + self._test_replica_id(distribution) + + @combinations.generate(one_device_combinations()) + def testCallAndMergeExceptions(self, distribution): + self._test_call_and_merge_exceptions(distribution) + + +class MirroredStrategyVariableCreatorStackTest( + test.TestCase, parameterized.TestCase): + + @combinations.generate(combinations.combine( + distribution=[combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph"])) + def testCreatorStacksAreThreadLocal(self, distribution): + def model_fn(): + replica_id_str = str(self.evaluate(_replica_id())) + + def thread_creator_fn(next_creator, *args, **kwargs): + return next_creator(*args, **kwargs) + ":thread_" + replica_id_str + + with variable_scope.variable_creator_scope(thread_creator_fn): + # Create a variable in this scope. + v = variable_scope.variable(1.0) + + # This will pause the current thread, and execute the other thread. + ds_context.get_replica_context().merge_call(lambda _: _) + return v + + def main_thread_creator(next_creator, *args, **kwargs): + # We are not using the underlying next_creator for test purposes. + del next_creator, args, kwargs + return "main_thread" + + with context.graph_mode(), \ + distribution.scope(), \ + variable_scope.variable_creator_scope(main_thread_creator): + result = distribution.extended.call_for_each_replica(model_fn) + result = distribution.unwrap(result) + expected = ["main_thread:thread_0", "main_thread:thread_1"] + self.assertEqual(expected, result) + + +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) class MirroredStrategyVariableCreationTest(test.TestCase): - config = config_pb2.ConfigProto() - config.allow_soft_placement = True + # TODO(priyag): Modify more tests to use this helper and check more + # properties. + def _test_mv_properties(self, var, name): + self.assertIsInstance(var, values.MirroredVariable) + self.assertEqual(name, var.name) + for d in var.devices: + self.assertEqual(d, var.get(d).device) - def _skip_eager_if_gpus_less_than(self, num_gpus): - if context.num_gpus() < num_gpus and context.executing_eagerly(): - self.skipTest("Enough GPUs not available for this test in eager mode.") + def testVariableInFuncGraph(self, distribution): + def model_fn(): + v = variable_scope.variable(2.0, name="bar") + ds_context.get_replica_context().merge_call(lambda _: _) + return v - @test_util.run_in_graph_and_eager_modes(config=config) - def testSingleVariable(self): - self._skip_eager_if_gpus_less_than(1) + with func_graph.FuncGraph("fg").as_default(), distribution.scope(): + v1 = variable_scope.variable(1.0, name="foo") + v2 = distribution.extended.call_for_each_replica(model_fn) + self._test_mv_properties(v1, "foo:0") + self._test_mv_properties(v2, "bar:0") + + def testSingleVariable(self, distribution): def model_fn(): # This variable should be created only once across the threads because of - # special variable_creator functions used by `dist.call_for_each_replica`. 
+ # special variable_creator functions used by + # `distribution.extended.call_for_each_replica`. v = variable_scope.variable(1.0, name="foo") - distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) + ds_context.get_replica_context().merge_call(lambda _: _) return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) - self.assertIsInstance(result, values.MirroredVariable) - self.assertEquals("foo:0", result.name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testUnnamedVariable(self): - self._skip_eager_if_gpus_less_than(1) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) + self._test_mv_properties(result, "foo:0") + def testUnnamedVariable(self, distribution): def model_fn(): v = variable_scope.variable(1.0) - distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) + ds_context.get_replica_context().merge_call(lambda _: _) return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) - self.assertIsInstance(result, values.MirroredVariable) - # Default name of "Variable" will be used. - self.assertEquals("Variable:0", result.name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testMultipleVariables(self): - self._skip_eager_if_gpus_less_than(1) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) + self._test_mv_properties(result, "Variable:0") + def testMultipleVariables(self, distribution): def model_fn(): vs = [] for i in range(5): vs.append(variable_scope.variable(1.0, name="foo" + str(i))) - distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) + ds_context.get_replica_context().merge_call(lambda _: _) return vs - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) for i, v in enumerate(result): - self.assertIsInstance(v, values.MirroredVariable) - self.assertEquals("foo" + str(i) + ":0", v.name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testMultipleVariablesWithSameCanonicalName(self): - self._skip_eager_if_gpus_less_than(1) + self._test_mv_properties(v, "foo" + str(i) + ":0") + def testMultipleVariablesWithSameCanonicalName(self, distribution): def model_fn(): vs = [] vs.append(variable_scope.variable(1.0, name="foo/bar")) vs.append(variable_scope.variable(1.0, name="foo_1/bar")) vs.append(variable_scope.variable(1.0, name="foo_1/bar_1")) vs.append(variable_scope.variable(1.0, name="foo/bar_1")) - distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) + ds_context.get_replica_context().merge_call(lambda _: _) return vs - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) for v in result: self.assertIsInstance(v, values.MirroredVariable) - self.assertEquals(4, len(result)) - self.assertEquals("foo/bar:0", result[0].name) - self.assertEquals("foo_1/bar:0", result[1].name) - self.assertEquals("foo_1/bar_1:0", result[2].name) - self.assertEquals("foo/bar_1:0", 
result[3].name) + self.assertEqual(4, len(result)) + self.assertEqual("foo/bar:0", result[0].name) + self.assertEqual("foo_1/bar:0", result[1].name) + self.assertEqual("foo_1/bar_1:0", result[2].name) + self.assertEqual("foo/bar_1:0", result[3].name) - @test_util.run_in_graph_and_eager_modes(config=config) - def testVariableWithSameCanonicalNameAcrossThreads(self): - self._skip_eager_if_gpus_less_than(1) - - def model_fn(device_id): - v = variable_scope.variable(1.0, name="foo_" + str(device_id)) - distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) + def testVariableWithSameCanonicalNameAcrossThreads(self, distribution): + def model_fn(): + replica_id = self.evaluate(_replica_id()) + v = variable_scope.variable(1.0, name="foo_" + str(replica_id)) + ds_context.get_replica_context().merge_call(lambda _: _) return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica( - model_fn, args=(dist.worker_device_index,)) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) self.assertIsInstance(result, values.MirroredVariable) # The resulting mirrored variable will use the name from the first device. - self.assertEquals("foo_0:0", result.name) + self.assertEqual("foo_0:0", result.name) - @test_util.run_in_graph_and_eager_modes(config=config) - def testWithLayers(self): - self._skip_eager_if_gpus_less_than(1) + def testWithLayers(self, distribution): def model_fn(features): with variable_scope.variable_scope("common"): layer1 = core.Dense(1) @@ -305,17 +289,14 @@ class MirroredStrategyVariableCreationTest(test.TestCase): layer2 = core.Dense(1) layer2(features) # This will pause the current thread, and execute the other thread. 
- distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) + ds_context.get_replica_context().merge_call(lambda _: _) layer3 = core.Dense(1) layer3(features) return [(layer1.kernel, layer1.bias), (layer2.kernel, layer2.bias), (layer3.kernel, layer3.bias)] - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - ds = dist.distribute_dataset( + ds = distribution.distribute_dataset( lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)) if context.executing_eagerly(): iterator = ds.make_one_shot_iterator() @@ -325,26 +306,23 @@ class MirroredStrategyVariableCreationTest(test.TestCase): features = iterator.get_next() - with dist.scope(): - result = dist.call_for_each_replica(model_fn, args=(features,)) + with distribution.scope(): + result = distribution.extended.call_for_each_replica( + model_fn, args=(features,)) suffixes = ["", "_1", "_2"] for (kernel, bias), suffix in zip(result, suffixes): self.assertIsInstance(kernel, values.MirroredVariable) - self.assertEquals("common/dense" + suffix + "/kernel:0", kernel.name) + self.assertEqual("common/dense" + suffix + "/kernel:0", kernel.name) self.assertIsInstance(bias, values.MirroredVariable) - self.assertEquals("common/dense" + suffix + "/bias:0", bias.name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testWithVariableAndVariableScope(self): - self._skip_eager_if_gpus_less_than(1) + self.assertEqual("common/dense" + suffix + "/bias:0", bias.name) + def testWithVariableAndVariableScope(self, distribution): def model_fn(): v0 = variable_scope.variable(1.0, name="var0", aggregation=None) with variable_scope.variable_scope("common"): v1 = variable_scope.variable(1.0, name="var1") # This will pause the current thread, and execute the other thread. 
- distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) + ds_context.get_replica_context().merge_call(lambda _: _) v2 = variable_scope.variable( 1.0, name="var2", @@ -358,37 +336,31 @@ class MirroredStrategyVariableCreationTest(test.TestCase): return v0, v1, v2, v3 - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + with distribution.scope(): v = variable_scope.variable(1.0, name="var-main0") - self.assertEquals("var-main0:0", v.name) + self.assertEqual("var-main0:0", v.name) - result = dist.call_for_each_replica(model_fn) - self.assertEquals(4, len(result)) + result = distribution.extended.call_for_each_replica(model_fn) + self.assertEqual(4, len(result)) v0, v1, v2, v3 = result self.assertIsInstance(v0, values.MirroredVariable) - self.assertEquals("var0:0", v0.name) + self.assertEqual("var0:0", v0.name) self.assertIsInstance(v1, values.MirroredVariable) - self.assertEquals("common/var1:0", v1.name) + self.assertEqual("common/var1:0", v1.name) self.assertIsInstance(v2, values.ReplicaLocalVariable) - self.assertEquals("common/var2:0", v2.name) - self.assertEquals(variable_scope.VariableAggregation.SUM, v2.aggregation) + self.assertEqual("common/var2:0", v2.name) + self.assertEqual(variable_scope.VariableAggregation.SUM, v2.aggregation) self.assertIsInstance(v3, values.MirroredVariable) - self.assertEquals("common/var3:0", v3.name) - self.assertEquals(variable_scope.VariableAggregation.MEAN, v3.aggregation) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testWithGetVariableAndVariableScope(self): - self._skip_eager_if_gpus_less_than(1) + self.assertEqual("common/var3:0", v3.name) + self.assertEqual(variable_scope.VariableAggregation.MEAN, v3.aggregation) + def testWithGetVariableAndVariableScope(self, distribution): def model_fn(): v0 = variable_scope.get_variable("var0", [1]) with variable_scope.variable_scope("common"): v1 = variable_scope.get_variable("var1", [1]) # This will pause the current thread, and execute the other thread. 
- distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) + ds_context.get_replica_context().merge_call(lambda _: _) v2 = variable_scope.get_variable( "var2", [1], synchronization=variable_scope.VariableSynchronization.ON_READ, @@ -400,33 +372,28 @@ class MirroredStrategyVariableCreationTest(test.TestCase): return v0, v1, v2, v3 - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + with distribution.scope(): with variable_scope.variable_scope("main"): v = variable_scope.get_variable("var-main0", [1]) - self.assertEquals("main/var-main0:0", v.name) + self.assertEqual("main/var-main0:0", v.name) - result = dist.call_for_each_replica(model_fn) - self.assertEquals(4, len(result)) + result = distribution.extended.call_for_each_replica(model_fn) + self.assertEqual(4, len(result)) v0, v1, v2, v3 = result self.assertIsInstance(v0, values.MirroredVariable) - self.assertEquals("main/var0:0", v0.name) + self.assertEqual("main/var0:0", v0.name) self.assertIsInstance(v1, values.MirroredVariable) - self.assertEquals("main/common/var1:0", v1.name) + self.assertEqual("main/common/var1:0", v1.name) self.assertIsInstance(v2, values.ReplicaLocalVariable) - self.assertEquals("main/common/var2:0", v2.name) - self.assertEquals(variable_scope.VariableAggregation.SUM, - v2.aggregation) + self.assertEqual("main/common/var2:0", v2.name) + self.assertEqual(variable_scope.VariableAggregation.SUM, + v2.aggregation) self.assertIsInstance(v3, values.MirroredVariable) - self.assertEquals("main/common/var3:0", v3.name) - self.assertEquals(variable_scope.VariableAggregation.MEAN, - v3.aggregation) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testOnlyFirstReplicaUpdatesVariables(self): - self._skip_eager_if_gpus_less_than(1) + self.assertEqual("main/common/var3:0", v3.name) + self.assertEqual(variable_scope.VariableAggregation.MEAN, + v3.aggregation) + def testOnlyFirstReplicaUpdatesVariables(self, distribution): def create_fn(): aggregation = variable_scope.VariableAggregation.ONLY_FIRST_REPLICA v0 = variable_scope.variable( @@ -442,71 +409,73 @@ class MirroredStrategyVariableCreationTest(test.TestCase): return v0, v1 devices = ["/device:GPU:0", "/device:CPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): - v0, v1 = dist.call_for_each_replica(create_fn) + with distribution.scope(): + v0, v1 = distribution.extended.call_for_each_replica(create_fn) self.evaluate(v0.initializer) self.assertEqual(2.0, self.evaluate(v0.get(devices[0]))) self.assertEqual(2.0, self.evaluate(v0.get(devices[1]))) - self.assertEqual(2.0, self.evaluate(dist.read_var(v0))) + self.assertEqual(2.0, self.evaluate(distribution.extended.read_var(v0))) self.evaluate(v1.initializer) self.assertEqual(3.0, self.evaluate(v1.get(devices[0]))) self.assertEqual(3.0, self.evaluate(v1.get(devices[1]))) - self.assertEqual(3.0, self.evaluate(dist.read_var(v1))) + self.assertEqual(3.0, self.evaluate(distribution.extended.read_var(v1))) + + def replica_id_plus_one(): + return math_ops.cast(_replica_id() + 1, dtype=dtypes.float32) # Update using the assign_add member function. 
- def update_member_fn(device_id): - update0 = v0.assign_add(5.0 * (device_id + 1)) - update1 = v1.assign_add(7.0 * (device_id + 1)) + def update_member_fn(): + update0 = v0.assign_add(5.0 * replica_id_plus_one()) + update1 = v1.assign_add(7.0 * replica_id_plus_one()) return update0, update1 - update0a, update1a = dist.call_for_each_replica( - update_member_fn, args=(dist.worker_device_index,)) + update0a, update1a = distribution.extended.call_for_each_replica( + update_member_fn) # Update "sync on read" variable. - self.evaluate(dist.group(update0a)) + self.evaluate(distribution.group(update0a)) self.assertEqual(2.0 + 5.0, self.evaluate(v0.get(devices[0]))) # Writes are not synchronized for "sync on read" variables, # so device[1] can end up with a different value. self.assertEqual(2.0 + 2*5.0, self.evaluate(v0.get(devices[1]))) # Always reads from device 0. - self.assertEqual(2.0 + 5.0, self.evaluate(dist.read_var(v0))) + self.assertEqual(2.0 + 5.0, self.evaluate( + distribution.extended.read_var(v0))) # Update "sync on write" variable. - self.evaluate(dist.group(update1a)) + self.evaluate(distribution.group(update1a)) self.assertEqual(3.0 + 7.0, self.evaluate(v1.get(devices[0]))) # Writes are synchronized for v1, only the argument to assign_add on # device[0] is used. self.assertEqual(3.0 + 7.0, self.evaluate(v1.get(devices[1]))) - self.assertEqual(3.0 + 7.0, self.evaluate(dist.read_var(v1))) + self.assertEqual(3.0 + 7.0, self.evaluate( + distribution.extended.read_var(v1))) # Update using state_ops.assign_add global function. - def update_state_ops_fn(device_id): - update0 = state_ops.assign_add(v0, 11.0 * (device_id + 1)) - update1 = state_ops.assign_add(v1, 13.0 * (device_id + 1)) + def update_state_ops_fn(): + update0 = state_ops.assign_add(v0, 11.0 * replica_id_plus_one()) + update1 = state_ops.assign_add(v1, 13.0 * replica_id_plus_one()) return update0, update1 - update0b, update1b = dist.call_for_each_replica( - update_state_ops_fn, args=(dist.worker_device_index,)) - self.evaluate(dist.group(update0b)) + update0b, update1b = distribution.extended.call_for_each_replica( + update_state_ops_fn) + self.evaluate(distribution.group(update0b)) # Update "sync on read" variable. self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(v0.get(devices[0]))) self.assertEqual(2.0 + 2*5.0 + 2*11.0, self.evaluate(v0.get(devices[1]))) - self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(dist.read_var(v0))) + self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate( + distribution.extended.read_var(v0))) # Update "sync on write" variable. - self.evaluate(dist.group(update1b)) + self.evaluate(distribution.group(update1b)) self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.get(devices[0]))) self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.get(devices[1]))) - self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(dist.read_var(v1))) + self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate( + distribution.extended.read_var(v1))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testNoneSynchronizationWithGetVariable(self): - self._skip_eager_if_gpus_less_than(1) - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + def testNoneSynchronizationWithGetVariable(self, distribution): + with distribution.scope(): with self.assertRaisesRegexp( ValueError, "`NONE` variable synchronization mode is not " "supported with `Mirrored` distribution strategy. 
Please change " @@ -515,12 +484,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase): "v", [1], synchronization=variable_scope.VariableSynchronization.NONE) - @test_util.run_in_graph_and_eager_modes(config=config) - def testNoneSynchronizationWithVariable(self): - self._skip_eager_if_gpus_less_than(1) - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + def testNoneSynchronizationWithVariable(self, distribution): + with distribution.scope(): with self.assertRaisesRegexp( ValueError, "`NONE` variable synchronization mode is not " "supported with `Mirrored` distribution strategy. Please change " @@ -530,23 +495,15 @@ class MirroredStrategyVariableCreationTest(test.TestCase): name="v", synchronization=variable_scope.VariableSynchronization.NONE) - @test_util.run_in_graph_and_eager_modes(config=config) - def testInvalidSynchronizationWithVariable(self): - self._skip_eager_if_gpus_less_than(1) - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + def testInvalidSynchronizationWithVariable(self, distribution): + with distribution.scope(): with self.assertRaisesRegexp( ValueError, "Invalid variable synchronization mode: Invalid for " "variable: v"): variable_scope.variable(1.0, name="v", synchronization="Invalid") - @test_util.run_in_graph_and_eager_modes(config=config) - def testInvalidAggregationWithGetVariable(self): - self._skip_eager_if_gpus_less_than(1) - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + def testInvalidAggregationWithGetVariable(self, distribution): + with distribution.scope(): with self.assertRaisesRegexp( ValueError, "Invalid variable aggregation mode: invalid for " "variable: v"): @@ -555,12 +512,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase): synchronization=variable_scope.VariableSynchronization.ON_WRITE, aggregation="invalid") - @test_util.run_in_graph_and_eager_modes(config=config) - def testInvalidAggregationWithVariable(self): - self._skip_eager_if_gpus_less_than(1) - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + def testInvalidAggregationWithVariable(self, distribution): + with distribution.scope(): with self.assertRaisesRegexp( ValueError, "Invalid variable aggregation mode: invalid for " "variable: v"): @@ -570,55 +523,28 @@ class MirroredStrategyVariableCreationTest(test.TestCase): synchronization=variable_scope.VariableSynchronization.ON_WRITE, aggregation="invalid") - @test_util.run_in_graph_and_eager_modes(config=config) - def testThreeDevices(self): - self._skip_eager_if_gpus_less_than(2) - - def model_fn(): - v = variable_scope.variable(1.0, name="foo") - distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) - return v - - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) - self.assertIsInstance(result, values.MirroredVariable) - self.assertEquals("foo:0", result.name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testNonMatchingVariableCreation(self): - self._skip_eager_if_gpus_less_than(1) - + def testNonMatchingVariableCreation(self, distribution): def model_fn(name): v = variable_scope.variable(1.0, name=name) - distribution_strategy_context.get_replica_context().merge_call( - lambda _: 
_) + ds_context.get_replica_context().merge_call(lambda _: _) return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): + with distribution.scope(): names = values.DistributedValues({ "/device:CPU:0": "foo", "/device:GPU:0": "bar" }) with self.assertRaises(RuntimeError): - _ = dist.call_for_each_replica(model_fn, args=(names,)) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testReplicaLocalVariable(self): - self._skip_eager_if_gpus_less_than(1) + _ = distribution.extended.call_for_each_replica(model_fn, args=(names,)) + def testReplicaLocalVariable(self, distribution): all_v_sum = {} all_v_mean = {} components_sum = {} components_mean = {} - def model_fn(device_id): + def model_fn(): + replica_id = self.evaluate(_replica_id()) v_sum = variable_scope.variable( 1.0, synchronization=variable_scope.VariableSynchronization.ON_READ, @@ -629,26 +555,22 @@ class MirroredStrategyVariableCreationTest(test.TestCase): aggregation=variable_scope.VariableAggregation.MEAN) self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable)) self.assertTrue(isinstance(v_mean, values.ReplicaLocalVariable)) - updates = [v_sum.assign_add(2.0 + device_id), - v_mean.assign(6.0 * device_id)] - all_v_sum[device_id] = v_sum - all_v_mean[device_id] = v_mean + updates = [v_sum.assign_add(2.0 + replica_id), + v_mean.assign(6.0 * replica_id)] + all_v_sum[replica_id] = v_sum + all_v_mean[replica_id] = v_mean c_sum = v_sum.get() c_mean = v_mean.get() - components_sum[device_id] = c_sum - components_mean[device_id] = c_mean + components_sum[replica_id] = c_sum + components_mean[replica_id] = c_mean self.assertIsNot(v_sum, c_sum) self.assertIsNot(v_mean, c_mean) return updates, v_sum, v_mean, c_sum, c_mean - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): + with distribution.scope(): # Create "sum" and "mean" versions of ReplicaLocalVariables. ret_ops, ret_v_sum, ret_v_mean, regrouped_sum, regrouped_mean = ( - dist.call_for_each_replica( - model_fn, args=(dist.worker_device_index,))) + distribution.extended.call_for_each_replica(model_fn)) # Should see the same wrapping instance in all replicas. self.assertIs(all_v_sum[0], ret_v_sum) self.assertIs(all_v_mean[0], ret_v_mean) @@ -663,10 +585,10 @@ class MirroredStrategyVariableCreationTest(test.TestCase): # Apply updates self.evaluate(variables.global_variables_initializer()) - self.evaluate([y for x in ret_ops for y in dist.unwrap(x)]) + self.evaluate([y for x in ret_ops for y in distribution.unwrap(x)]) expected_sum = 0.0 expected_mean = 0.0 - for i, d in enumerate(dist.worker_devices): + for i, d in enumerate(distribution.extended.worker_devices): # Should see different values on different devices. v_sum_value = self.evaluate(ret_v_sum.get(d).read_value()) v_mean_value = self.evaluate(ret_v_mean.get(d).read_value()) @@ -676,135 +598,22 @@ class MirroredStrategyVariableCreationTest(test.TestCase): expected = i * 6.0 self.assertEqual(expected, v_mean_value) expected_mean += expected - expected_mean /= len(dist.worker_devices) + expected_mean /= len(distribution.extended.worker_devices) # Without get(device), should return the value you get by # applying the reduction across all replicas (whether you use # read_var(), get(), or nothing). 
- self.assertEqual(expected_sum, self.evaluate(dist.read_var(ret_v_sum))) - self.assertEqual(expected_mean, self.evaluate(dist.read_var(ret_v_mean))) + self.assertEqual(expected_sum, self.evaluate( + distribution.extended.read_var(ret_v_sum))) + self.assertEqual(expected_mean, self.evaluate( + distribution.extended.read_var(ret_v_mean))) self.assertEqual(expected_sum, self.evaluate(ret_v_sum.get())) self.assertEqual(expected_mean, self.evaluate(ret_v_mean.get())) self.assertEqual(expected_sum, self.evaluate(ret_v_sum)) self.assertEqual(expected_mean, self.evaluate(ret_v_mean)) - # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not - # testing this in eager mode. - - def testNameScope(self): - def model_fn(): - with ops.name_scope("foo"): - a = constant_op.constant(1.0, name="a") - distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) - b = constant_op.constant(1.0, name="b") - return a, b - - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with context.graph_mode(), dist.scope(): - with ops.name_scope("main"): - result = dist.call_for_each_replica(model_fn) - self.assertEquals(2, len(result)) - for v, name in zip(result, ["a", "b"]): - self.assertIsInstance(v, values.DistributedValues) - v0, v1 = dist.unwrap(v) - self.assertEquals("main/foo/" + name + ":0", v0.name) - self.assertEquals("main/replica_1/foo/" + name + ":0", v1.name) - - def testWithDefaultName(self): - def model_fn(): - with ops.name_scope(None, "foo"): - a = constant_op.constant(1.0, name="a") - distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) - b = constant_op.constant(2.0, name="b") - return a, b - - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with context.graph_mode(), dist.scope(): - result = dist.call_for_each_replica(model_fn) - self.assertEquals(2, len(result)) - for v, name in zip(result, ["a", "b"]): - self.assertIsInstance(v, values.DistributedValues) - v0, v1 = dist.unwrap(v) - self.assertEquals("foo/" + name + ":0", v0.name) - self.assertEquals("replica_1/foo/" + name + ":0", v1.name) - - # variable_scope.variable() respects name scopes when creating - # variables. On the other hand variable_scope.get_variable() ignores name - # scopes when creating variables. We test both methods of creating variables - # to make sure that we have the same variable names in both cases. 
- def testNameScopeWithVariable(self): - def in_cross_replica(_): - c = variable_scope.variable(1.0, name="c") - return c - - def model_fn(): - b = variable_scope.variable(1.0, name="b") - with ops.name_scope("foo"): - c = distribution_strategy_context.get_replica_context().merge_call( - in_cross_replica) - return b, c - - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with context.graph_mode(), dist.scope(): - with ops.name_scope("main"): - a = variable_scope.variable(1.0, name="a") - result = dist.call_for_each_replica(model_fn) - result_b = result[0] - result_c = result[1] - self.assertIsInstance(result_b, values.DistributedValues) - self.assertIsInstance(result_c, values.DistributedValues) - a0, a1 = dist.unwrap(a) - b0, b1 = dist.unwrap(result_b) - c0, c1 = dist.unwrap(result_c) - self.assertEquals("main/a:0", a0.name) - self.assertEquals("main/a/replica_1:0", a1.name) - self.assertEquals("main/b:0", b0.name) - self.assertEquals("main/b/replica_1:0", b1.name) - self.assertEquals("main/foo/c:0", c0.name) - self.assertEquals("main/foo/c/replica_1:0", c1.name) - - def testNameScopeWithGetVariable(self): - def in_cross_replica(_): - c = variable_scope.get_variable("c", [1]) - return c - - def model_fn(): - b = variable_scope.get_variable("b", [1]) - with ops.name_scope("foo"): - c = distribution_strategy_context.get_replica_context().merge_call( - in_cross_replica) - return b, c - - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with context.graph_mode(), dist.scope(): - with ops.name_scope("main"): - a = variable_scope.get_variable("a", [1]) - result = dist.call_for_each_replica(model_fn) - result_b = result[0] - result_c = result[1] - self.assertIsInstance(result_b, values.DistributedValues) - self.assertIsInstance(result_c, values.DistributedValues) - a0, a1 = dist.unwrap(a) - b0, b1 = dist.unwrap(result_b) - c0, c1 = dist.unwrap(result_c) - self.assertEquals("a:0", a0.name) - self.assertEquals("a/replica_1:0", a1.name) - self.assertEquals("b:0", b0.name) - self.assertEquals("b/replica_1:0", b1.name) - self.assertEquals("c:0", c0.name) - self.assertEquals("c/replica_1:0", c1.name) - - def testDynamicRnnVariables(self): + # TODO(priyag): Update this test to work in eager mode as well. + def testDynamicRnnVariables(self, distribution): def model_fn(): inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]]) cell_fw = rnn_cell_impl.LSTMCell(300) @@ -816,81 +625,208 @@ class MirroredStrategyVariableCreationTest(test.TestCase): dtype=dtypes.float32) return outputs - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with context.graph_mode(), dist.scope(): - result = dist.call_for_each_replica(model_fn) + with context.graph_mode(), distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) # Two variables are created by the RNN layer. 
- self.assertEquals(2, len(result)) + self.assertEqual(2, len(result)) for v in result: self.assertIsInstance(v, values.DistributedValues) - _, v1 = dist.unwrap(v) - self.assertStartsWith(v1.name, "replica_1/") + _, v1 = distribution.unwrap(v) + self.assertStartsWith(v1._op.name, "replica_1/") - @test_util.run_in_graph_and_eager_modes(config=config) - def testReplicaLocalVariableUpdate(self): - with context.graph_mode(): + def testReplicaLocalVariableUpdate(self, distribution): + def model_fn(): + v_sum = variable_scope.variable( + 1.0, + synchronization=variable_scope.VariableSynchronization.ON_READ, + aggregation=variable_scope.VariableAggregation.SUM) + self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable)) + return v_sum - def model_fn(): - v_sum = variable_scope.variable( - 1.0, - synchronization=variable_scope.VariableSynchronization.ON_READ, - aggregation=variable_scope.VariableAggregation.SUM) - self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable)) - return v_sum + def update(var, value): + return var.assign(value) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:GPU:1"]) + with distribution.scope(): + ret_v_sum = distribution.extended.call_for_each_replica(model_fn) - def update(var, value): - return var.assign(value) + # Initialize variables. + self.evaluate(variables.global_variables_initializer()) + # Assert that the aggregated value of the replica local vars is the sum + # of the individual values before running the update ops. + self.assertEqual(1.0, self.evaluate(ret_v_sum.get( + distribution.extended.worker_devices[0]).read_value())) + self.assertEqual(2.0, self.evaluate(ret_v_sum)) - with dist.scope(): - ret_v_sum = dist.call_for_each_replica(model_fn) - update_ops = dist.update(ret_v_sum, update, 5.0, grouped=False) - - # Initialize variables. - self.evaluate(variables.global_variables_initializer()) - # Assert that the aggregated value of the replica local vars is the sum - # of the individual values before running the update ops. - self.assertEquals(1.0, self.evaluate( - ret_v_sum.get(dist._devices[0]).read_value())) - self.assertEquals(2.0, self.evaluate(ret_v_sum)) - - # Apply updates. - self.evaluate(update_ops) - # Assert that the aggregated value of the replica local vars is the sum - # of the individual values after running the update ops. - self.assertEquals(5.0, self.evaluate( - ret_v_sum.get(dist._devices[0]).read_value())) - self.assertEquals(10.0, self.evaluate(ret_v_sum)) + # Apply updates. + update_ops = distribution.extended.update( + ret_v_sum, update, args=(5.0,), group=False) + self.evaluate(update_ops) + # Assert that the aggregated value of the replica local vars is the sum + # of the individual values after running the update ops. + self.assertEqual(5.0, self.evaluate(ret_v_sum.get( + distribution.extended.worker_devices[0]).read_value())) + self.assertEqual(10.0, self.evaluate(ret_v_sum)) +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph"])) +class MirroredStrategyNameScopeTest(test.TestCase): + # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not + # testing this in eager mode. 
+ + def testNameScope(self, distribution): + def model_fn(): + with ops.name_scope("foo"): + a = constant_op.constant(1.0, name="a") + ds_context.get_replica_context().merge_call(lambda _: _) + b = constant_op.constant(1.0, name="b") + return a, b + + with context.graph_mode(), distribution.scope(): + with ops.name_scope("main"): + result = distribution.extended.call_for_each_replica(model_fn) + self.assertEqual(2, len(result)) + for v, name in zip(result, ["a", "b"]): + self.assertIsInstance(v, values.DistributedValues) + v0, v1 = distribution.unwrap(v) + self.assertEqual("main/foo/" + name + ":0", v0.name) + self.assertEqual("main/replica_1/foo/" + name + ":0", v1.name) + + def testWithDefaultName(self, distribution): + def model_fn(): + with ops.name_scope(None, "foo"): + a = constant_op.constant(1.0, name="a") + ds_context.get_replica_context().merge_call(lambda _: _) + b = constant_op.constant(2.0, name="b") + return a, b + + with context.graph_mode(), distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) + self.assertEqual(2, len(result)) + for v, name in zip(result, ["a", "b"]): + self.assertIsInstance(v, values.DistributedValues) + v0, v1 = distribution.unwrap(v) + self.assertEqual("foo/" + name + ":0", v0.name) + self.assertEqual("replica_1/foo/" + name + ":0", v1.name) + + # variable_scope.variable() respects name scopes when creating + # variables. On the other hand variable_scope.get_variable() ignores name + # scopes when creating variables. We test both methods of creating variables + # to make sure that we have the same variable names in both cases. + def testNameScopeWithVariable(self, distribution): + def in_cross_replica(_): + c = variable_scope.variable(1.0, name="c") + return c + + def model_fn(): + b = variable_scope.variable(1.0, name="b") + with ops.name_scope("foo"): + c = ds_context.get_replica_context().merge_call(in_cross_replica) + return b, c + + with context.graph_mode(), distribution.scope(): + with ops.name_scope("main"): + a = variable_scope.variable(1.0, name="a") + result = distribution.extended.call_for_each_replica(model_fn) + result_b = result[0] + result_c = result[1] + self.assertIsInstance(result_b, values.DistributedValues) + self.assertIsInstance(result_c, values.DistributedValues) + a0, a1 = distribution.unwrap(a) + b0, b1 = distribution.unwrap(result_b) + c0, c1 = distribution.unwrap(result_c) + self.assertEqual("main/a:0", a0.name) + self.assertEqual("main/a/replica_1:0", a1.name) + self.assertEqual("main/b:0", b0.name) + self.assertEqual("main/b/replica_1:0", b1.name) + self.assertEqual("main/foo/c:0", c0.name) + self.assertEqual("main/foo/c/replica_1:0", c1.name) + + def testNameScopeWithGetVariable(self, distribution): + def in_cross_replica(_): + c = variable_scope.get_variable("c", [1]) + return c + + def model_fn(): + b = variable_scope.get_variable("b", [1]) + with ops.name_scope("foo"): + c = ds_context.get_replica_context().merge_call(in_cross_replica) + return b, c + + with context.graph_mode(), distribution.scope(): + with ops.name_scope("main"): + a = variable_scope.get_variable("a", [1]) + result = distribution.extended.call_for_each_replica(model_fn) + result_b = result[0] + result_c = result[1] + self.assertIsInstance(result_b, values.DistributedValues) + self.assertIsInstance(result_c, values.DistributedValues) + a0, a1 = distribution.unwrap(a) + b0, b1 = distribution.unwrap(result_b) + c0, c1 = distribution.unwrap(result_c) + self.assertEqual("a:0", a0.name) + self.assertEqual("a/replica_1:0", 
a1.name) + self.assertEqual("b:0", b0.name) + self.assertEqual("b/replica_1:0", b1.name) + self.assertEqual("c:0", c0.name) + self.assertEqual("c/replica_1:0", c1.name) + + +@combinations.generate( + combinations.combine( + distribution=[ + combinations.NamedDistribution( + "Mirrored3Devices", + # pylint: disable=g-long-lambda + lambda: mirrored_strategy.MirroredStrategy( + ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]), + required_gpus=2), + combinations.NamedDistribution( + "CoreMirrored3Devices", + # pylint: disable=g-long-lambda + lambda: mirrored_strategy.CoreMirroredStrategy( + ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]), + required_gpus=2) + ], + mode=["graph", "eager"])) +class MirroredThreeDeviceDistributionTest( + strategy_test_lib.DistributionTestBase, + parameterized.TestCase): + + def testThreeDevices(self, distribution): + def model_fn(): + v = variable_scope.variable(1.0, name="foo") + ds_context.get_replica_context().merge_call(lambda _: _) + return v + + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) + self.assertIsInstance(result, values.MirroredVariable) + self.assertEqual("foo:0", result.name) + + +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) class MirroredVariableUpdateTest(test.TestCase): # The following tests check assign, assign_add and assign_sub on Mirrored # variables in replica and cross replica context. - config = config_pb2.ConfigProto() - config.allow_soft_placement = True - def _skip_eager_if_gpus_less_than(self, num_gpus): - if context.num_gpus() < num_gpus and context.executing_eagerly(): - self.skipTest("Enough GPUs not available for this test in eager mode.") - - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignMirroredVarReplicaContextWithoutAggregationType(self): + def testAssignMirroredVarReplicaContextWithoutAggregationType(self, + distribution): # Test that we always have an aggregation type set on the mirrored variable # if we assign to it in replica mode. - self._skip_eager_if_gpus_less_than(1) def var_fn(): v = variable_scope.variable(1.0, name="foo") return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) @@ -900,23 +836,19 @@ class MirroredVariableUpdateTest(test.TestCase): with self.assertRaisesRegexp( ValueError, "You must specify an aggregation method to update a " "MirroredVariable in Replica Context."): - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignMirroredVarReplicaContextWithSum(self): + def testAssignMirroredVarReplicaContextWithSum(self, distribution): # Test that we don't reduce a non-per-replica value with the "sum" # aggregation type. 
- self._skip_eager_if_gpus_less_than(1) def var_fn(): v = variable_scope.variable( 1.0, name="foo", aggregation=variable_scope.VariableAggregation.SUM) return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) @@ -925,219 +857,184 @@ class MirroredVariableUpdateTest(test.TestCase): with self.assertRaisesRegexp( ValueError, "A non-DistributedValues value 5.0 cannot be reduced " - "with the given aggregation VariableAggregation.SUM."): - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) + "with the given reduce op ReduceOp.SUM."): + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignMirroredVarCrossDeviceContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignMirroredVarCrossDeviceContext(self, distribution): def var_fn(): return variable_scope.variable(1.0, name="foo") - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) mirrored_var_result = self.evaluate(mirrored_var.assign(6.0)) - self.assertEquals(6.0, mirrored_var_result) + self.assertEqual(6.0, mirrored_var_result) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignMirroredVarReplicaContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignMirroredVarReplicaContext(self, distribution): def var_fn(): return variable_scope.variable( 1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) def model_fn(): value = math_ops.cast( - distribution_strategy_context.get_replica_context().replica_id, + ds_context.get_replica_context().replica_id_in_sync_group, mirrored_var.dtype) return mirrored_var.assign(value) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(0.5, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(0.5, self.evaluate(mirrored_var)) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignMirroredVarReplicaContextWithSingleValue(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignMirroredVarReplicaContextWithSingleValue(self, distribution): def var_fn(): return variable_scope.variable( 1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = 
mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) def model_fn(): return mirrored_var.assign(5.0) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(5.0, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(5.0, self.evaluate(mirrored_var)) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignAddMirroredVarCrossDeviceContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignAddMirroredVarCrossDeviceContext(self, distribution): def var_fn(): return variable_scope.variable(1.0, name="foo") - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) # read_value == True mirrored_var_result = self.evaluate( mirrored_var.assign_add(6.0, read_value=True)) - self.assertEquals(7.0, mirrored_var_result) - self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) - self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) + self.assertEqual(7.0, mirrored_var_result) + self.assertEqual(7.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) + self.assertEqual(7.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) # read_value == False self.evaluate(mirrored_var.assign_add(2.0, read_value=False)) - self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) - self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) + self.assertEqual(9.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) + self.assertEqual(9.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignAddMirroredVarReplicaContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignAddMirroredVarReplicaContext(self, distribution): def var_fn(): return variable_scope.variable( 1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) def model_fn(): value = math_ops.cast( - distribution_strategy_context.get_replica_context().replica_id, + ds_context.get_replica_context().replica_id_in_sync_group, mirrored_var.dtype) return mirrored_var.assign_add(value) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - 
self.assertEquals(1.5, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(1.5, self.evaluate(mirrored_var)) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignAddMirroredVarReplicaContextWithSingleValue(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignAddMirroredVarReplicaContextWithSingleValue(self, distribution): def var_fn(): return variable_scope.variable( 1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) def model_fn(): return mirrored_var.assign_add(5.0) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(6.0, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(6.0, self.evaluate(mirrored_var)) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignSubMirroredVarCrossDeviceContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignSubMirroredVarCrossDeviceContext(self, distribution): def var_fn(): return variable_scope.variable(5.0, name="foo") - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(5.0, self.evaluate(mirrored_var)) + self.assertEqual(5.0, self.evaluate(mirrored_var)) mirrored_var_result = self.evaluate(mirrored_var.assign_sub(2.0)) - self.assertEquals(3.0, mirrored_var_result) - self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) - self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) + self.assertEqual(3.0, mirrored_var_result) + self.assertEqual(3.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) + self.assertEqual(3.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignSubMirroredVarReplicaContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignSubMirroredVarReplicaContext(self, distribution): def var_fn(): return variable_scope.variable( 5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(5.0, self.evaluate(mirrored_var)) + self.assertEqual(5.0, self.evaluate(mirrored_var)) def model_fn(): value = math_ops.cast( - distribution_strategy_context.get_replica_context().replica_id, + 
ds_context.get_replica_context().replica_id_in_sync_group, mirrored_var.dtype) return mirrored_var.assign_sub(value) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(4.5, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(4.5, self.evaluate(mirrored_var)) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignSubMirroredVarReplicaContextWithSingleValue(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignSubMirroredVarReplicaContextWithSingleValue(self, distribution): def var_fn(): return variable_scope.variable( 5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(5.0, self.evaluate(mirrored_var)) + self.assertEqual(5.0, self.evaluate(mirrored_var)) def model_fn(): return mirrored_var.assign_sub(1.0) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(4.0, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(4.0, self.evaluate(mirrored_var)) +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase): - config = config_pb2.ConfigProto() - config.allow_soft_placement = True - def testAssignMirroredVarInitializer(self): + def testAssignMirroredVarInitializer(self, distribution): # This test is not eager compatible since in eager variables are initialized # upon construction instead of once the initialization op is run. with context.graph_mode(): @@ -1145,17 +1042,14 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase): v = variable_scope.variable(1.0, name="foo") return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.assertFalse(self.evaluate(mirrored_var.is_initialized())) self.evaluate(mirrored_var.initializer) self.assertTrue(self.evaluate(mirrored_var.is_initialized())) - def testAssignReplicaLocalVarInitializer(self): + def testAssignReplicaLocalVarInitializer(self, distribution): # This test is not eager compatible since in eager variables are initialized # upon construction instead of once the initialization op is run. 
with context.graph_mode(): @@ -1167,11 +1061,9 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase): self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable)) return v_sum - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - replica_local_var = dist.call_for_each_replica(model_fn) + with distribution.scope(): + replica_local_var = distribution.extended.call_for_each_replica( + model_fn) self.assertTrue(isinstance(replica_local_var, values.ReplicaLocalVariable)) self.assertFalse(self.evaluate(replica_local_var.is_initialized())) @@ -1179,17 +1071,14 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase): self.assertTrue(self.evaluate(replica_local_var.is_initialized())) +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) class ReplicaLocalVariableAssignTest(test.TestCase): - config = config_pb2.ConfigProto() - config.allow_soft_placement = True - def _skip_eager_if_gpus_less_than(self, num_gpus): - if context.num_gpus() < num_gpus and context.executing_eagerly(): - self.skipTest("Not enough GPUs available for this test in eager mode.") - - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignReplicaLocalVarSumAggregation(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignReplicaLocalVarSumAggregation(self, distribution): def model_fn(): v_sum = variable_scope.variable( 1.0, @@ -1197,18 +1086,16 @@ class ReplicaLocalVariableAssignTest(test.TestCase): aggregation=variable_scope.VariableAggregation.SUM) return v_sum - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - replica_local_var = dist.call_for_each_replica(model_fn) + with distribution.scope(): + replica_local_var = distribution.extended.call_for_each_replica(model_fn) self.assertTrue(isinstance(replica_local_var, values.ReplicaLocalVariable)) self.evaluate(variables.global_variables_initializer()) # Each replica has a value of 1.0 assigned to it in replica context. # When we read the value using `read_var` we should see the SUM of each of # values on each of the replicas. - self.assertEqual(2.0, self.evaluate(dist.read_var(replica_local_var))) + self.assertEqual(2.0, self.evaluate( + distribution.read_var(replica_local_var))) # Assigning 6.0 in cross replica context will assign a value of # 6.0/num_replicas to each replica. tlv_ops = replica_local_var.assign(6.0) @@ -1216,11 +1103,10 @@ class ReplicaLocalVariableAssignTest(test.TestCase): # On reading the replica local var we should get the assigned value back. # The value on all the replicas are added before being returned by # `read_var`. 
- self.assertEqual(6.0, self.evaluate(dist.read_var(replica_local_var))) + self.assertEqual(6.0, self.evaluate( + distribution.read_var(replica_local_var))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignReplicaLocalVarMeanAggregation(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignReplicaLocalVarMeanAggregation(self, distribution): def model_fn(): v_sum = variable_scope.variable( 1.0, @@ -1228,23 +1114,22 @@ class ReplicaLocalVariableAssignTest(test.TestCase): aggregation=variable_scope.VariableAggregation.MEAN) return v_sum - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - replica_local_var = dist.call_for_each_replica(model_fn) + with distribution.scope(): + replica_local_var = distribution.extended.call_for_each_replica(model_fn) self.assertTrue(isinstance(replica_local_var, values.ReplicaLocalVariable)) self.evaluate(variables.global_variables_initializer()) # Each replica has a value of 1.0 assigned to it in replica context. # When we read the value using `read_var` we should see the MEAN of values # on all replicas which is the value assigned in replica context. - self.assertEqual(1.0, self.evaluate(dist.read_var(replica_local_var))) + self.assertEqual(1.0, self.evaluate( + distribution.read_var(replica_local_var))) tlv_ops = replica_local_var.assign(6.0) self.evaluate(tlv_ops) # On reading the replica local var we should get the MEAN of all values # which is equal to the value assigned. - self.assertEqual(6.0, self.evaluate(dist.read_var(replica_local_var))) + self.assertEqual(6.0, self.evaluate( + distribution.read_var(replica_local_var))) class MockModel(object): @@ -1278,24 +1163,25 @@ class MiniModel(keras_training.Model): return self.fc(inputs) +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) class MirroredStrategyDefunTest(test.TestCase): - def _skip_eager_if_gpus_less_than(self, num_gpus): - if context.num_gpus() < num_gpus and context.executing_eagerly(): - self.skipTest("Not enough GPUs available for this test in eager mode.") - - def _call_and_check(self, model_fn, inputs, expected_result, defuns, - two_variables=False): + def _call_and_check(self, distribution, model_fn, inputs, expected_result, + defuns, two_variables=False): cpu_dev = device_util.canonicalize("CPU:0") gpu_dev = device_util.canonicalize("GPU:0") devices = [cpu_dev, gpu_dev] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + with distribution.scope(): mock_model = MockModel(two_variables) self.evaluate(variables.global_variables_initializer()) - result = dist.call_for_each_replica(model_fn, args=[mock_model] + inputs) + result = distribution.extended.call_for_each_replica( + model_fn, args=[mock_model] + inputs) for device in devices: device_result = values.select_device(device, result) device_expected_result = values.select_device(device, expected_result) @@ -1307,17 +1193,15 @@ class MirroredStrategyDefunTest(test.TestCase): # call_for_each has one trace per device. To check that the expected set # of variables was accessed on each trace, we first retrieve each # device-specific graph function. 
- per_replica_graph_functions = dist.call_for_each_replica( - defun.get_concrete_function, args=[mock_model] + inputs) + per_replica_graph_functions = ( + distribution.extended.call_for_each_replica( + defun.get_concrete_function, args=[mock_model] + inputs)) for device in devices: graph_function = per_replica_graph_functions.get(device=device) self.assertEqual(set(mock_model.variables), set(graph_function.graph.variables)) - @test_util.run_in_graph_and_eager_modes() - def testVariableInDefun(self): - self._skip_eager_if_gpus_less_than(1) - + def testVariableInDefun(self, distribution): @function.defun def times_two(mock_model): return mock_model() @@ -1325,12 +1209,9 @@ class MirroredStrategyDefunTest(test.TestCase): def model_fn(mock_model): return times_two(mock_model) - self._call_and_check(model_fn, [], 2.5, [times_two]) - - @test_util.run_in_graph_and_eager_modes() - def testVariableInNestedDefun(self): - self._skip_eager_if_gpus_less_than(1) + self._call_and_check(distribution, model_fn, [], 2.5, [times_two]) + def testVariableInNestedDefun(self, distribution): @function.defun def times_two(mock_model): return mock_model() @@ -1342,12 +1223,10 @@ class MirroredStrategyDefunTest(test.TestCase): def model_fn(mock_model): return two_x_plus_one(mock_model) - self._call_and_check(model_fn, [], 3.5, [times_two, two_x_plus_one]) - - @test_util.run_in_graph_and_eager_modes() - def testTwoVariablesInNestedDefun(self): - self._skip_eager_if_gpus_less_than(1) + self._call_and_check(distribution, model_fn, [], 3.5, + [times_two, two_x_plus_one]) + def testTwoVariablesInNestedDefun(self, distribution): @function.defun def fn1(mock_model): return mock_model() @@ -1359,12 +1238,10 @@ class MirroredStrategyDefunTest(test.TestCase): def model_fn(mock_model): return fn2(mock_model) - self._call_and_check(model_fn, [], 5.5, [fn1, fn2], two_variables=True) - - @test_util.run_in_graph_and_eager_modes() - def testGradientTapeOverNestedDefuns(self): - self._skip_eager_if_gpus_less_than(1) + self._call_and_check(distribution, model_fn, [], 5.5, [fn1, fn2], + two_variables=True) + def testGradientTapeOverNestedDefuns(self, distribution): @function.defun def fn1(mock_model): return mock_model() @@ -1380,13 +1257,10 @@ class MirroredStrategyDefunTest(test.TestCase): [v.get() for v in mock_model.variables]) return grads - self._call_and_check(model_fn, [], [2.0, 1.0], [fn1, fn2], + self._call_and_check(distribution, model_fn, [], [2.0, 1.0], [fn1, fn2], two_variables=True) - @test_util.run_in_graph_and_eager_modes() - def testPassPerReplica(self): - self._skip_eager_if_gpus_less_than(1) - + def testPassPerReplica(self, distribution): @function.defun def fn1(mock_model, factor): return mock_model(factor) @@ -1394,18 +1268,10 @@ class MirroredStrategyDefunTest(test.TestCase): factors = values.PerReplica({"CPU:0": 5.0, "GPU:0": 3.0}) expected_result = values.PerReplica({"CPU:0": 5.0 * 1.25, "GPU:0": 3.0 * 1.25}) - self._call_and_check(fn1, [factors], expected_result, [fn1]) + self._call_and_check(distribution, fn1, [factors], expected_result, [fn1]) - @test_util.run_in_graph_and_eager_modes() - def testTrain(self): - self._skip_eager_if_gpus_less_than(1) - - cpu_dev = device_util.canonicalize("CPU:0") - gpu_dev = device_util.canonicalize("GPU:0") - devices = [cpu_dev, gpu_dev] - dist = mirrored_strategy.MirroredStrategy(devices) - - with dist.scope(): + def testTrain(self, distribution): + with distribution.scope(): mock_model = MiniModel() mock_model.call = function.defun(mock_model.call) @@ -1415,10 +1281,11 @@ 
class MirroredStrategyDefunTest(test.TestCase): gradients_fn = backprop.implicit_grad(loss_fn) gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn) - grads_and_vars = dist.call_for_each_replica(gradients_fn, args=(None,)) + grads_and_vars = distribution.extended.call_for_each_replica( + gradients_fn, args=(None,)) optimizer = gradient_descent.GradientDescentOptimizer(0.25) - update_ops = optimizer._distributed_apply(dist, grads_and_vars) # pylint: disable=protected-access + update_ops = optimizer._distributed_apply(distribution, grads_and_vars) # pylint: disable=protected-access if not context.executing_eagerly(): self.evaluate(variables.global_variables_initializer()) @@ -1430,30 +1297,82 @@ class MirroredStrategyDefunTest(test.TestCase): self.assertAllEqual([0.5], updated_var_values[1]) +@combinations.generate( + combinations.combine( + distribution=[ + combinations.NamedDistribution( + "Mirrored", + # pylint: disable=g-long-lambda + lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker= + context.num_gpus()), + required_gpus=1), + combinations.NamedDistribution( + "CoreMirrored", + # pylint: disable=g-long-lambda + lambda: mirrored_strategy.CoreMirroredStrategy( + mirrored_strategy.all_local_devices()), + required_gpus=1) + ], + mode=["graph"])) class MultiWorkerMirroredStrategyTest( multi_worker_test_base.MultiWorkerTestBase, strategy_test_lib.DistributionTestBase): - def _get_distribution_strategy(self): + def _configure_distribution_strategy(self, distribution): cluster_spec = server_lib.ClusterSpec({ "worker": ["/job:worker/task:0", "/job:worker/task:1"] }) - strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus()) - strategy.configure(cluster_spec=cluster_spec) - return strategy + distribution.configure(cluster_spec=cluster_spec) - def test_num_replicas_in_sync(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - - strategy = self._get_distribution_strategy() + def test_num_replicas_in_sync(self, distribution): + self._configure_distribution_strategy(distribution) # We calculate the total number of gpus across the workers(2) specified in # the cluster spec. - self.assertEqual(context.num_gpus() * 2, strategy.num_replicas_in_sync) + self.assertEqual(context.num_gpus() * 2, distribution.num_replicas_in_sync) - def testMinimizeLossGraph(self): - self._test_minimize_loss_graph(self._get_distribution_strategy(), - learning_rate=0.05) + def testMinimizeLossGraph(self, distribution): + self._configure_distribution_strategy(distribution) + self._test_minimize_loss_graph(distribution, learning_rate=0.05) + + def testDeviceScope(self, distribution): + """Test the device scope of multi-worker MirroredStrategy.""" + self._configure_distribution_strategy(distribution) + with distribution.scope(): + a = constant_op.constant(1.) + with ops.device("/cpu:0"): + b = constant_op.constant(1.) + self.assertEqual(a.device, "/job:worker/task:0") + self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0") + + def testMakeInputFnIterator(self, distribution): + self._configure_distribution_strategy(distribution) + dataset_fn = lambda: dataset_ops.Dataset.range(100) + num_gpus = context.num_gpus() + num_workers = 2 + + expected_values = [[i+j for j in range(num_gpus)] * num_workers + for i in range(0, 100, num_gpus)] + + with context.graph_mode(), self.cached_session() as sess: + # `expected_input_pipeline_id` is None because the input_fn will be called + # multiple times, each with a different input_pipeline_id. 
+ input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=num_workers*num_gpus, + expected_num_input_pipelines=num_workers, + expected_input_pipeline_id=None) + iterator = distribution.make_input_fn_iterator(input_fn) + self._test_input_fn_iterator( + iterator, distribution.extended.worker_devices, expected_values, sess) + + def testUpdateConfigProto(self, distribution): + distribution.configure(cluster_spec={"worker": ["fake1", "fake2"]}) + + config_proto = config_pb2.ConfigProto() + new_config = distribution.update_config_proto(config_proto) + + # Verify isolate_session_state + self.assertTrue(new_config.isolate_session_state) class MultiWorkerMirroredStrategyTestWithChief( @@ -1473,6 +1392,19 @@ class MultiWorkerMirroredStrategyTestWithChief( strategy.configure(cluster_spec=self._cluster_spec) self._test_minimize_loss_graph(strategy, learning_rate=0.05) + def testMinimizeLossGraphCoreMirroredStrategy(self): + strategy = mirrored_strategy.CoreMirroredStrategy( + mirrored_strategy.all_local_devices()) + strategy.configure(cluster_spec=self._cluster_spec) + self._test_minimize_loss_graph(strategy, learning_rate=0.05) + + +def _replica_id(): + replica_id = ds_context.get_replica_context().replica_id_in_sync_group + if not isinstance(replica_id, ops.Tensor): + replica_id = constant_op.constant(replica_id) + return replica_id + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py deleted file mode 100644 index bea684e77ca..00000000000 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for class MirroredStrategy.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.distribute.python import mirrored_strategy -from tensorflow.contrib.distribute.python import strategy_test_lib -from tensorflow.python.eager import context -from tensorflow.python.eager import test -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util -from tensorflow.python.ops import variable_scope -from tensorflow.python.training import distribution_strategy_context - - -class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase): - - def _get_distribution_strategy(self): - return mirrored_strategy.MirroredStrategy(["/device:CPU:0"]) - - def testMinimizeLossEager(self): - self._test_minimize_loss_eager(self._get_distribution_strategy()) - - def testMinimizeLossGraph(self): - self._test_minimize_loss_graph(self._get_distribution_strategy()) - - def testDeviceIndex(self): - self._test_device_index(self._get_distribution_strategy()) - - def testReplicaId(self): - self._test_replica_id(self._get_distribution_strategy()) - - @test_util.run_in_graph_and_eager_modes - def testCallAndMergeExceptions(self): - self._test_call_and_merge_exceptions(self._get_distribution_strategy()) - - -class VariableCreatorStackTest(test.TestCase): - - def testCreatorStacksAreThreadLocal(self): - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - - def model_fn(device_id): - assert isinstance(device_id, int) - - def thread_creator_fn(next_creator, *args, **kwargs): - return next_creator(*args, **kwargs) + ":thread_" + str(device_id) - - with variable_scope.variable_creator_scope(thread_creator_fn): - # Create a variable in this scope. - v = variable_scope.variable(1.0) - - # This will pause the current thread, and execute the other thread. - distribution_strategy_context.get_replica_context().merge_call( - lambda _: _) - return v - - def main_thread_creator(next_creator, *args, **kwargs): - # We are not using the underlying next_creator for test purposes. - del next_creator, args, kwargs - return "main_thread" - - with context.graph_mode(), \ - dist.scope(), \ - variable_scope.variable_creator_scope(main_thread_creator): - result = dist.call_for_each_replica( - model_fn, args=(dist.worker_device_index,)) - result = dist.unwrap(result) - expected = ["main_thread:thread_0", "main_thread:thread_1"] - self.assertEquals(expected, result) - - -class MultiWorkerMirroredStrategyTest(test.TestCase): - - def testDeviceScope(self): - """Test the device scope of multi-worker MirroredStrategy.""" - with context.graph_mode(): - strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus()) - strategy.configure( - cluster_spec={"worker": ["/job:worker/task:0", "/job:worker/task:1"]}) - with strategy.scope(): - a = constant_op.constant(1.) - with ops.device("/cpu:0"): - b = constant_op.constant(1.) 
- self.assertEqual(a.device, "/job:worker/task:0") - self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0") - - -if __name__ == "__main__": - test.main() diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py index 7ecc852d205..c492d8bafc9 100644 --- a/tensorflow/contrib/distribute/python/moving_averages_test.py +++ b/tensorflow/contrib/distribute/python/moving_averages_test.py @@ -32,7 +32,8 @@ from tensorflow.python.training import moving_averages all_combinations = combinations.combine( distribution=[combinations.default_strategy, combinations.one_device_strategy, - combinations.mirrored_strategy_with_gpu_and_cpu], + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], mode=["graph"]) diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py index 8eec3dc0f6e..147c9b83f86 100644 --- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py +++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py @@ -18,8 +18,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import contextlib import copy +import json +import os import threading import numpy as np @@ -271,7 +274,6 @@ class MultiWorkerTestBase(test.TestCase): return config - def _run_client(self, client_fn, task_type, task_id, num_gpus, *args, **kwargs): result = client_fn(task_type, task_id, num_gpus, *args, **kwargs) @@ -303,3 +305,101 @@ class MultiWorkerTestBase(test.TestCase): for t in threads: t.join() self.assertEqual(self._result, len(threads)) + + +class MockOsEnv(collections.Mapping): + """A class that allows per-thread TF_CONFIG.""" + + def __init__(self, *args): + self._dict = dict() + self._thread_local = threading.local() + super(MockOsEnv, self).__init__(*args) + + def get(self, key, default=None): + if not hasattr(self._thread_local, 'dict'): + self._thread_local.dict = dict() + if key == 'TF_CONFIG': + return dict.get(self._thread_local.dict, key, default) + else: + return dict.get(self._dict, key, default) + + def __getitem__(self, key): + if not hasattr(self._thread_local, 'dict'): + self._thread_local.dict = dict() + if key == 'TF_CONFIG': + return dict.__getitem__(self._thread_local.dict, key) + else: + return dict.__getitem__(self._dict, key) + + def __setitem__(self, key, val): + if not hasattr(self._thread_local, 'dict'): + self._thread_local.dict = dict() + if key == 'TF_CONFIG': + return dict.__setitem__(self._thread_local.dict, key, val) + else: + return dict.__setitem__(self._dict, key, val) + + def __iter__(self): + if not hasattr(self._thread_local, 'dict'): + self._thread_local.dict = dict() + for x in self._thread_local.dict.items(): + yield x + for x in self._dict.items(): + yield x + + def __len__(self): + if not hasattr(self._thread_local, 'dict'): + self._thread_local.dict = dict() + return self._thread_local.dict.__len__() + self._dict.__len__() + + +class IndependentWorkerTestBase(test.TestCase): + """Testing infra for independent workers.""" + + def setUp(self): + self._mock_os_env = MockOsEnv() + self._mock_context = test.mock.patch.object(os, 'environ', + self._mock_os_env) + super(IndependentWorkerTestBase, self).setUp() + self._mock_context.__enter__() + + def tearDown(self): + self._mock_context.__exit__(None, None, None) + super(IndependentWorkerTestBase, self).tearDown() 
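# ----------------------------------------------------------------------------
# Why TF_CONFIG must be per-thread in the harness above: each simulated worker
# runs as a thread of the same process but discovers its cluster membership
# from the TF_CONFIG environment variable, so a shared os.environ would make
# every "worker" see whichever value was written last. A minimal
# standard-library sketch of the thread-local idea; the helper names and
# addresses are hypothetical, and this is not the MockOsEnv implementation.

import json
import threading

_per_thread = threading.local()


def set_tf_config(cluster_spec, task_type, task_id):
  # Same TF_CONFIG layout the test harness serializes for each task.
  _per_thread.tf_config = json.dumps({
      "cluster": cluster_spec,
      "task": {"type": task_type, "index": task_id},
  })


def get_tf_config():
  return getattr(_per_thread, "tf_config", None)


def fake_worker(task_id):
  set_tf_config({"worker": ["localhost:1234", "localhost:1235"]},
                "worker", task_id)
  # Each thread only ever sees the value it wrote itself.
  assert json.loads(get_tf_config())["task"]["index"] == task_id


threads = [threading.Thread(target=fake_worker, args=(i,)) for i in range(2)]
for t in threads:
  t.start()
for t in threads:
  t.join()
# ----------------------------------------------------------------------------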
+ + def _task_thread(self, task_fn, tf_config, *args, **kwargs): + os.environ['TF_CONFIG'] = json.dumps(tf_config) + task_fn(*args, **kwargs) + + def _run_task_in_thread(self, task_fn, cluster_spec, task_type, task_id, + *args, **kwargs): + if task_type: + tf_config = { + 'cluster': cluster_spec, + 'task': { + 'type': task_type, + 'index': task_id + } + } + else: + tf_config = { + 'cluster': cluster_spec, + } + t = threading.Thread( + target=self._task_thread, + args=(task_fn, tf_config) + args, + kwargs=kwargs) + t.start() + return t + + def run_multiple_tasks_in_threads(self, task_fn, cluster_spec, *args, + **kwargs): + # The task_fn should create std_server by itself. + threads = {} + for task_type in cluster_spec.keys(): + threads[task_type] = [] + for task_id in range(len(cluster_spec[task_type])): + t = self._run_task_in_thread(task_fn, cluster_spec, task_type, task_id, + *args, **kwargs) + threads[task_type].append(t) + return threads diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py index a0d8f938874..e322b6acb84 100644 --- a/tensorflow/contrib/distribute/python/one_device_strategy.py +++ b/tensorflow/contrib/distribute/python/one_device_strategy.py @@ -20,12 +20,14 @@ from __future__ import print_function import six -from tensorflow.contrib.distribute.python import values +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import distribute_lib +from tensorflow.python.distribute import values from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops -from tensorflow.python.training import distribute as distribute_lib from tensorflow.python.util import nest @@ -39,7 +41,14 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): # implementations? 
def __init__(self, device): - super(OneDeviceStrategy, self).__init__() + super(OneDeviceStrategy, self).__init__(OneDeviceExtended(self, device)) + + +class OneDeviceExtended(distribute_lib.DistributionStrategyExtended): + """Implementation of OneDeviceStrategy.""" + + def __init__(self, container_strategy, device): + super(OneDeviceExtended, self).__init__(container_strategy) self._device = device self._default_device = device @@ -58,17 +67,33 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): with ops.colocate_with(colocate_with): return next_creator(*args, **kwargs) - def distribute_dataset(self, dataset_fn): + def _make_dataset_iterator(self, dataset): + """Make iterator from dataset without splitting the batch.""" + worker = device_util.canonicalize("/device:CPU:0") + worker_device_pairs = [(worker, [self._device])] + return values.DatasetIterator(dataset, worker_device_pairs) + + def _distribute_dataset(self, dataset_fn): return values.PerReplicaDataset( self._call_dataset_fn(dataset_fn), [self._device]) - def _broadcast(self, tensor, destinations): + def _make_input_fn_iterator( + self, + input_fn, + replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): + worker = device_util.canonicalize("/device:CPU:0") + worker_device_pairs = [(worker, [self._device])] + return values.InputFunctionIterator( + input_fn, worker_device_pairs, + [distribute_lib.InputContext()]) + + def _broadcast_to(self, tensor, destinations): del destinations return tensor # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed. - def _run_steps_on_dataset(self, fn, iterator, iterations, - initial_loop_values=None): + def _experimental_run_steps_on_iterator(self, fn, iterator, iterations, + initial_loop_values=None): if initial_loop_values is None: initial_loop_values = {} initial_loop_values = nest.flatten(initial_loop_values) @@ -80,7 +105,7 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): fn_inputs = iterator.get_next() if not isinstance(fn_inputs, tuple): fn_inputs = (fn_inputs,) - fn_result = fn(ctx, *fn_inputs) + fn_result = fn(ctx, fn_inputs) flat_last_step_outputs = nest.flatten(ctx.last_step_outputs) with ops.control_dependencies([fn_result]): return [i + 1] + flat_last_step_outputs @@ -114,25 +139,24 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): return ctx def _call_for_each_replica(self, fn, args, kwargs): - with ops.device(self._device), _OneDeviceReplicaContext(self): + strategy = self._container_strategy() + with ops.device(self._device), _OneDeviceReplicaContext(strategy): return fn(*args, **kwargs) - def _reduce(self, aggregation, value, destinations): - del aggregation, destinations + def _reduce_to(self, reduce_op, value, destinations): + del reduce_op, destinations return value - def _update(self, var, options, fn, *args, **kwargs): + def _update(self, var, fn, args, kwargs, group): # The implementations of _update() and _update_non_slot() are identical # except _update() passes `var` as the first argument to `fn()`. - return self._update_non_slot(var, options, fn, var, *args, **kwargs) + return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group) - def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs): + def _update_non_slot(self, colocate_with, fn, args, kwargs, group): del colocate_with - should_group = options.pop("grouped") - assert not options # Validate that we are processing all of the options. 
with ops.device(self._device), distribute_lib.UpdateContext(self._device): result = fn(*args, **kwargs) - if should_group: + if group: return result else: return nest.map_structure(self._unwrap, result) @@ -148,11 +172,7 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): return value @property - def num_replicas(self): - return 1 - - @property - def num_replicas_in_sync(self): + def _num_replicas_in_sync(self): return 1 @property @@ -167,8 +187,22 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): del var_list return [self._device] - def _worker_device_index(self): - return 0 + @property + def experimental_should_init(self): + return True + + @property + def should_checkpoint(self): + return True + + @property + def should_save_summary(self): + return True + + # TODO(priyag): Delete this once all strategies use global batch size. + @property + def _global_batch_size(self): + return True class _OneDeviceReplicaContext(distribute_lib.ReplicaContext): @@ -176,12 +210,10 @@ class _OneDeviceReplicaContext(distribute_lib.ReplicaContext): def __init__(self, distribution_strategy): distribute_lib.ReplicaContext.__init__( - self, distribution_strategy, replica_id=0) - - @property - def device(self): - raise RuntimeError("Use .devices instead") + self, + distribution_strategy, + replica_id_in_sync_group=constant_op.constant(0, dtypes.int32)) @property def devices(self): - return [self._distribution_strategy.worker_devices[0]] + return [self._distribution_strategy.extended.worker_devices[0]] diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py index 95f4cdb7868..d46cd6f529e 100644 --- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py +++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.contrib.distribute.python import one_device_strategy from tensorflow.contrib.distribute.python import strategy_test_lib +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import test from tensorflow.python.framework import test_util @@ -35,9 +36,6 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase): def testMinimizeLossGraph(self): self._test_minimize_loss_graph(self._get_distribution_strategy()) - def testDeviceIndex(self): - self._test_device_index(self._get_distribution_strategy()) - def testReplicaId(self): self._test_replica_id(self._get_distribution_strategy()) @@ -45,6 +43,20 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase): def testCallAndMergeExceptions(self): self._test_call_and_merge_exceptions(self._get_distribution_strategy()) + @test_util.run_in_graph_and_eager_modes + def testMakeInputFnIterator(self): + d = one_device_strategy.OneDeviceStrategy("/device:CPU:0") + dataset_fn = lambda: dataset_ops.Dataset.range(10) + expected_values = [[i] for i in range(10)] + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=1, + expected_num_input_pipelines=1, + expected_input_pipeline_id=0) + iterator = d.make_input_fn_iterator(input_fn) + self._test_input_fn_iterator( + iterator, d.extended.worker_devices, expected_values) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py index 790b37f8601..eaeb4d70301 100644 --- 
a/tensorflow/contrib/distribute/python/parameter_server_strategy.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py @@ -18,10 +18,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib +import copy + from tensorflow.contrib.distribute.python import mirrored_strategy -from tensorflow.contrib.distribute.python import values +from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import multi_worker_util +from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.framework import device as tf_device from tensorflow.python.framework import ops @@ -30,8 +34,6 @@ from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import device_setter -from tensorflow.python.training import device_util -from tensorflow.python.training import distribute as distribute_lib from tensorflow.python.util import nest _LOCAL_CPU = "/device:CPU:0" @@ -94,13 +96,21 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): ValueError: if `cluster_spec` is given but `task_type` or `task_id` is not. """ - super(ParameterServerStrategy, self).__init__() + super(ParameterServerStrategy, self).__init__( + ParameterServerExtended(self, num_gpus_per_worker)) + + +class ParameterServerExtended(distribute_lib.DistributionStrategyExtended): + """Implementation of ParameterServerStrategy.""" + + def __init__(self, container_strategy, num_gpus_per_worker): + super(ParameterServerExtended, self).__init__(container_strategy) self._num_gpus_per_worker = num_gpus_per_worker self._initialize_local(num_gpus_per_worker) # We typically don't need to do all-reduce in this strategy. - self._cross_tower_ops = ( - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps( + self._cross_device_ops = ( + cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps( reduce_to_device=_LOCAL_CPU)) def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec, @@ -189,6 +199,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): def _initialize_local(self, num_gpus_per_worker): """Initialize internal devices for local training.""" + self._worker_device = device_util.canonicalize("/device:CPU:0") # Define compute devices which is a list of device strings and one for each # replica. When there are GPUs, replicate operations on these GPUs. # Otherwise, place operations on CPU. 
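# ----------------------------------------------------------------------------
# The refactor in the hunk above (and the OneDeviceStrategy change earlier)
# follows one shape: the public DistributionStrategy subclass becomes a thin
# shell whose constructor builds a matching *Extended object, and the device
# logic lives on that object, reachable from tests as `strategy.extended`. A
# minimal generic sketch of that container/extended split; the class names
# below are hypothetical stand-ins, not TensorFlow APIs.


class _ExtendedImpl(object):
  """Holds the per-backend implementation details."""

  def __init__(self, container_strategy, devices):
    self._container_strategy = container_strategy
    self.worker_devices = tuple(devices)

  def call_for_each_replica(self, fn, args=()):
    # Real strategies run `fn` once per replica device; the sketch just fans
    # the call out sequentially.
    return [fn(*args) for _ in self.worker_devices]


class _Strategy(object):
  """Thin public wrapper that delegates to its extended implementation."""

  def __init__(self, devices):
    self.extended = _ExtendedImpl(self, devices)

  @property
  def num_replicas_in_sync(self):
    return len(self.extended.worker_devices)


strategy = _Strategy(["/device:CPU:0", "/device:GPU:0"])
assert strategy.num_replicas_in_sync == 2
assert strategy.extended.call_for_each_replica(lambda: 1.0) == [1.0, 1.0]
# ----------------------------------------------------------------------------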
@@ -221,15 +232,48 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): "ParameterServerStrategy with compute_devices = %r, " "variable_device = %r", self._compute_devices, self._variable_device) - def distribute_dataset(self, dataset_fn): + def _distribute_dataset(self, dataset_fn): """Distributes the dataset to each local GPU.""" return values.PerReplicaDataset( self._call_dataset_fn(dataset_fn), self._compute_devices, True) - def _broadcast(self, tensor, destinations): - if not cross_tower_ops_lib.check_destinations(destinations): + def _make_dataset_iterator(self, dataset): + worker_device_pairs = [(self._worker_device, self._compute_devices)] + return values.DatasetIterator(dataset, worker_device_pairs, + self._num_replicas_in_sync) + + def _make_input_fn_iterator( + self, + input_fn, + replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): + """Distributes the dataset to each local GPU.""" + if self._cluster_spec: + input_pipeline_id = multi_worker_util.id_in_cluster( + self._cluster_spec, self._task_type, self._task_id) + num_input_pipelines = multi_worker_util.worker_count( + self._cluster_spec, self._task_type) + else: + input_pipeline_id = 0 + num_input_pipelines = 1 + input_context = distribute_lib.InputContext( + num_input_pipelines=num_input_pipelines, + input_pipeline_id=input_pipeline_id, + num_replicas_in_sync=self._num_replicas_in_sync) + worker_device_pairs = [(self._worker_device, self._compute_devices)] + return values.InputFunctionIterator( + input_fn, worker_device_pairs, [input_context]) + + def _broadcast_to(self, tensor, destinations): + # This is both a fast path for Python constants, and a way to delay + # converting Python values to a tensor until we know what type it + # should be converted to. Otherwise we have trouble with: + # global_step.assign_add(1) + # since the `1` gets broadcast as an int32 but global_step is int64. + if isinstance(tensor, (float, int)): + return tensor + if not cross_device_ops_lib.check_destinations(destinations): destinations = self._compute_devices - return self._cross_tower_ops.broadcast(tensor, destinations) + return self._cross_device_ops.broadcast(tensor, destinations) def _allow_variable_partition(self): return not context.executing_eagerly() @@ -237,7 +281,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through # this creator, such as "MutableHashTable". 
def _create_variable(self, next_creator, *args, **kwargs): - if self.num_replicas_in_sync > 1: + if self._num_replicas_in_sync > 1: aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE) if aggregation not in ( vs.VariableAggregation.NONE, @@ -293,39 +337,35 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): def _call_for_each_replica(self, fn, args, kwargs): # pylint: disable=protected-access - return mirrored_strategy._call_for_each_replica(self, fn, args, kwargs) + return mirrored_strategy._call_for_each_replica( + self._container_strategy(), fn, args, kwargs) def _verify_destinations_not_different_worker(self, destinations): if not self._cluster_spec: return if destinations is None: return - for d in cross_tower_ops_lib.get_devices_from(destinations): + for d in cross_device_ops_lib.get_devices_from(destinations): d_spec = tf_device.DeviceSpec.from_string(d) if d_spec.job == self._task_type and d_spec.task != self._task_id: raise ValueError( "Cannot reduce to another worker: %r, current worker is %r" % (d, self._worker_device)) - def _reduce(self, aggregation, value, destinations): + def _reduce_to(self, reduce_op, value, destinations): self._verify_destinations_not_different_worker(destinations) if not isinstance(value, values.DistributedValues): # pylint: disable=protected-access return mirrored_strategy._reduce_non_distributed_value( - self, aggregation, value, destinations) - if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: - return self.broadcast(value.get(self._compute_devices[0]), destinations) - return self._cross_tower_ops.reduce( - aggregation, value, destinations=destinations) + self, reduce_op, value, destinations) + return self._cross_device_ops.reduce( + reduce_op, value, destinations=destinations) - def _batch_reduce(self, aggregation, value_destination_pairs): - if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: - return [self.broadcast(v.get(self._compute_devices[0]), d) - for v, d in value_destination_pairs] + def _batch_reduce_to(self, reduce_op, value_destination_pairs): for _, destinations in value_destination_pairs: self._verify_destinations_not_different_worker(destinations) - return self._cross_tower_ops.batch_reduce(aggregation, - value_destination_pairs) + return self._cross_device_ops.batch_reduce(reduce_op, + value_destination_pairs) def _select_single_value(self, structured): """Select any single values in `structured`.""" @@ -349,30 +389,26 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): return nest.map_structure(_select_fn, structured) - def _update(self, var, options, fn, *args, **kwargs): + def _update(self, var, fn, args, kwargs, group): if isinstance(var, values.AggregatingVariable): var = var.get() if not isinstance(var, resource_variable_ops.ResourceVariable): raise ValueError( "You can not update `var` %r. It must be a Variable." % var) - should_group = options.pop("grouped") - assert not options # Validate that we are processing all of the options. with ops.colocate_with(var), distribute_lib.UpdateContext(var.device): result = fn(var, *self._select_single_value(args), **self._select_single_value(kwargs)) - if should_group: + if group: return result else: return nest.map_structure(self._unwrap, result) # TODO(yuefengz): does it need to call _select_single_value? - def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs): - should_group = options.pop("grouped") - assert not options # Validate that we are processing all of the options. 
+ def _update_non_slot(self, colocate_with, fn, args, kwargs, group): with ops.device( colocate_with.device), distribute_lib.UpdateContext(colocate_with): result = fn(*args, **kwargs) - if should_group: + if group: return result else: return nest.map_structure(self._unwrap, result) @@ -398,11 +434,11 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): # variables. return array_ops.identity(var) - def configure(self, - session_config=None, - cluster_spec=None, - task_type=None, - task_id=None): + def _configure(self, + session_config=None, + cluster_spec=None, + task_type=None, + task_id=None): """Configures the strategy class. The strategy object will be re-initialized if `cluster_spec` is given but @@ -433,28 +469,30 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): self._initialize_multi_worker(self._num_gpus_per_worker, self._cluster_spec, task_type, task_id) - if not session_config or not self._cluster_spec: - return + if session_config: + session_config.CopyFrom(self._update_config_proto(session_config)) - session_config.isolate_session_state = False + def _update_config_proto(self, config_proto): + updated_config = copy.deepcopy(config_proto) + if not self._cluster_spec: + updated_config.isolate_session_state = True + return updated_config + + updated_config.isolate_session_state = False - assert self._cluster_spec assert self._task_type assert self._task_id is not None # The device filters prevent communication between workers. if self._task_type not in ["chief", "worker"]: - return - del session_config.device_filters[:] - session_config.device_filters.extend( + return updated_config + del updated_config.device_filters[:] + updated_config.device_filters.extend( ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"]) + return updated_config @property - def num_replicas(self): - return len(self._compute_devices) - - @property - def num_replicas_in_sync(self): + def _num_replicas_in_sync(self): return len(self._compute_devices) @property @@ -470,11 +508,12 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): return min(var_list, key=lambda x: x.name) @property - def between_graph(self): + def experimental_between_graph(self): + # TODO(yuefengz): Should this return False in the local case? return True @property - def should_init(self): + def experimental_should_init(self): return self._is_chief @property @@ -484,3 +523,8 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): @property def should_save_summary(self): return self._is_chief + + # TODO(priyag): Delete this once all strategies use global batch size. 
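`_update_config_proto` above replaces the in-place mutation that `configure` used to do. A small sketch of the contract, reusing only pieces visible in this diff (`config_pb2`, `copy.deepcopy`, the device-filter strings); the standalone function and dict-shaped cluster spec are simplifications:

```python
import copy
from tensorflow.core.protobuf import config_pb2

def update_config_proto(config_proto, cluster_spec, task_type, task_id):
    updated = copy.deepcopy(config_proto)   # never mutate the caller's proto
    if not cluster_spec:                    # local mode
        updated.isolate_session_state = True
        return updated
    updated.isolate_session_state = False
    if task_type in ("chief", "worker"):
        # Device filters keep this worker from talking to its peers directly.
        del updated.device_filters[:]
        updated.device_filters.extend(
            ["/job:%s/task:%d" % (task_type, task_id), "/job:ps"])
    return updated

cfg = update_config_proto(config_pb2.ConfigProto(device_filters=["stale"]),
                          cluster_spec={"worker": ["w0", "w1"]},
                          task_type="worker", task_id=1)
print(list(cfg.device_filters))  # ['/job:worker/task:1', '/job:ps']
```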
+ @property + def _global_batch_size(self): + return False diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py index 81a23c89030..83d7473666a 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py @@ -25,14 +25,21 @@ from absl.testing import parameterized from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import multi_worker_test_base from tensorflow.contrib.distribute.python import parameter_server_strategy -from tensorflow.contrib.distribute.python import values +from tensorflow.contrib.distribute.python import strategy_test_lib from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.distribute import multi_worker_util +from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.estimator import run_config from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util from tensorflow.python.layers import core from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -41,8 +48,6 @@ from tensorflow.python.ops import partitioned_variables from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import test -from tensorflow.python.training import device_util -from tensorflow.python.training import distribution_strategy_context from tensorflow.python.training import training_util CHIEF = run_config.TaskType.CHIEF @@ -50,6 +55,13 @@ WORKER = run_config.TaskType.WORKER PS = run_config.TaskType.PS +def _get_replica_id_integer(): + replica_id = ds_context.get_replica_context().replica_id_in_sync_group + if isinstance(replica_id, ops.Tensor): + replica_id = tensor_util.constant_value(replica_id) + return replica_id + + class ParameterServerStrategyTestBase( multi_worker_test_base.MultiWorkerTestBase): @@ -94,9 +106,8 @@ class ParameterServerStrategyTestBase( if num_gpus == 0: last_part_device = 'device:CPU:0' else: - last_part_device = ( - 'device:GPU:%d' % - distribution_strategy_context.get_replica_context().replica_id) + replica_id = _get_replica_id_integer() + last_part_device = ('device:GPU:%d' % replica_id) a = constant_op.constant(1.0) b = constant_op.constant(2.0) @@ -261,18 +272,16 @@ class ParameterServerStrategyTestBase( if 'CPU' in compute_device: replica_compute_device = '/device:CPU:0' else: - replica_compute_device = ( - '/device:GPU:%d' % - distribution_strategy_context.get_replica_context().replica_id) + replica_id = _get_replica_id_integer() + replica_compute_device = ('/device:GPU:%d' % replica_id) replica_compute_device = device_util.canonicalize( replica_compute_device) if 'CPU' in variable_device: replica_variable_device = '/device:CPU:0' else: - replica_variable_device = ( - '/device:GPU:%d' % - distribution_strategy_context.get_replica_context().replica_id) + replica_id = _get_replica_id_integer() + replica_variable_device 
= ('/device:GPU:%d' % replica_id) replica_variable_device = device_util.canonicalize( replica_variable_device) @@ -354,9 +363,9 @@ class ParameterServerStrategyTestBase( def _test_simple_increment(self, task_type, task_id, num_gpus): d, master_target, sess_config = self._get_test_objects( task_type, task_id, num_gpus) - if hasattr(d, '_cluster_spec') and d._cluster_spec: - num_workers = len(d._cluster_spec.as_dict().get(WORKER)) - if 'chief' in d._cluster_spec.as_dict(): + if d.extended._cluster_spec: + num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER)) + if 'chief' in d.extended._cluster_spec.as_dict(): num_workers += 1 else: num_workers = 1 @@ -389,7 +398,7 @@ class ParameterServerStrategyTestBase( x, y, z, train_op = d.call_for_each_replica(model_fn) train_op = d.group(train_op) - if context.num_gpus() < d._num_gpus_per_worker: + if context.num_gpus() < d.extended._num_gpus_per_worker: return True if task_id == 0: @@ -426,9 +435,9 @@ class ParameterServerStrategyTestBase( task_type, task_id, num_gpus) if task_type: # Multi-worker - assert hasattr(d, '_cluster_spec') and d._cluster_spec - num_workers = len(d._cluster_spec.as_dict().get(WORKER)) - if CHIEF in d._cluster_spec.as_dict(): + assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec + num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER)) + if CHIEF in d.extended._cluster_spec.as_dict(): num_workers += 1 else: # local @@ -472,8 +481,8 @@ class ParameterServerStrategyTestBase( before_list.append(fetched) with ops.control_dependencies([fetched]): # TODO(yuefengz): support non-Mirrored variable as destinations. - g = d.reduce( - variable_scope.VariableAggregation.SUM, g, destinations=v) + g = d.extended.reduce_to( + reduce_util.ReduceOp.SUM, g, destinations=v) with ops.control_dependencies( d.update(v, update, g, grouped=False)): after_list.append(d.read_var(v)) @@ -481,11 +490,12 @@ class ParameterServerStrategyTestBase( before_out, after_out = step() - if context.num_gpus() < d._num_gpus_per_worker: + if context.num_gpus() < d.extended._num_gpus_per_worker: return True if (not task_type or - multi_worker_util.is_chief(d._cluster_spec, task_type, task_id)): + multi_worker_util.is_chief( + d.extended._cluster_spec, task_type, task_id)): variables.global_variables_initializer().run() # Workers waiting for chief worker's initializing variables. @@ -508,8 +518,40 @@ class ParameterServerStrategyTestBase( self.assertLess(error_after, error_before) return error_after < error_before + def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn, + expected_values): + distribution, master_target, config = self._get_test_objects( + task_type, task_id, num_gpus) + devices = distribution.extended.worker_devices + + with ops.Graph().as_default(), \ + self.cached_session(config=config, + target=master_target) as sess: + iterator = distribution.make_input_fn_iterator(input_fn) + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + sess.run([values.select_device(d, next_element) for d in devices]) + + # After re-initializing the iterator, should be able to iterate again. 
+ sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + class ParameterServerStrategyTest(ParameterServerStrategyTestBase, + strategy_test_lib.DistributionTestBase, parameterized.TestCase): @classmethod @@ -574,6 +616,73 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase, def testMinimizeLossGraphLocal(self, num_gpus): self._test_minimize_loss_graph(None, None, num_gpus) + # TODO(priyag): Refactor this and other multi worker tests. + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1)) + def testMakeInputFnIteratorDistributed(self, num_gpus): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + dataset_fn = lambda: dataset_ops.Dataset.range(100) + expected_values = [[i+j for j in range(num_gpus)] + for i in range(0, 100, num_gpus)] + + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=num_gpus, + expected_num_input_pipelines=3, + expected_input_pipeline_id=1) # because task_id = 1 + self._test_input_fn_iterator('worker', 1, num_gpus, + input_fn, expected_values) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1)) + def testMakeInputFnIteratorLocal(self, num_gpus): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + dataset_fn = lambda: dataset_ops.Dataset.range(100) + expected_values = [[i+j for j in range(num_gpus)] + for i in range(0, 100, num_gpus)] + + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=num_gpus, + expected_num_input_pipelines=1, + expected_input_pipeline_id=0) # only one worker and pipeline for local. + self._test_input_fn_iterator(None, None, num_gpus, + input_fn, expected_values) + + def testGlobalStepUpdate(self): + strategy = parameter_server_strategy.ParameterServerStrategy( + num_gpus_per_worker=context.num_gpus()) + self._test_global_step_update(strategy) + + def testUpdateConfigProtoMultiWorker(self): + distribution = parameter_server_strategy.ParameterServerStrategy( + num_gpus_per_worker=2) + distribution.configure( + cluster_spec=self._cluster_spec, task_type='worker', task_id=1) + + config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden']) + + new_config = distribution.update_config_proto(config_proto) + + # Verify device filters. 
+ self.assertEqual(['/job:worker/task:1', '/job:ps'], + new_config.device_filters) + + # Verify isolate_session_state + self.assertFalse(new_config.isolate_session_state) + + def testUpdateConfigProtoLocal(self): + distribution = parameter_server_strategy.ParameterServerStrategy( + num_gpus_per_worker=2) + + config_proto = config_pb2.ConfigProto() + new_config = distribution.update_config_proto(config_proto) + + # Verify isolate_session_state + self.assertTrue(new_config.isolate_session_state) + class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, parameterized.TestCase): @@ -616,9 +725,9 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, v = variable_scope.get_variable('v', initializer=10.0) _ = v * v v, = tape.watched_variables() - w = distribution.value_container(v) + w = distribution.extended.value_container(v) self.assertIs(values.AggregatingVariable, type(w)) - distribution.call_for_each_replica(f) + distribution.extended.call_for_each_replica(f) if __name__ == '__main__': diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py index 3dc815f0371..c928b6d9f1f 100644 --- a/tensorflow/contrib/distribute/python/step_fn.py +++ b/tensorflow/contrib/distribute/python/step_fn.py @@ -94,7 +94,7 @@ class StandardSingleLossStep(StandardInputStep): def __call__(self): with self._distribution.scope(): - def step_fn(ctx, *inputs): + def step_fn(ctx, inputs): """Function to run one iteration with one input.""" gradients_fn = backprop.implicit_grad(self._loss_fn) gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn) diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py index 3c0c10430eb..d50b142c5e9 100644 --- a/tensorflow/contrib/distribute/python/strategy_test_lib.py +++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py @@ -19,16 +19,21 @@ from __future__ import division from __future__ import print_function from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.distribute import distribution_strategy_context as ds_context +from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.layers import core from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables -from tensorflow.python.training import distribution_strategy_context from tensorflow.python.training import optimizer @@ -45,8 +50,7 @@ def _raise_exception_fn(_=None): # Must be the argument to a distribution.call_for_each_replica() call, calls a # get_replica_context().merge_call() that raises an exception. def _merge_raises_fn(): - distribution_strategy_context.get_replica_context().merge_call( - _raise_exception_fn) + ds_context.get_replica_context().merge_call(_raise_exception_fn) # Must be the argument to a get_replica_context().merge_call() call, calls @@ -59,8 +63,7 @@ def _call_raises_fn(dist): # calls a get_replica_context().merge_call() that calls a # call_for_each_replica() that raises an exception. 
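The step_fn.py hunk above changes the per-step callback from `step_fn(ctx, *inputs)` to `step_fn(ctx, inputs)`. A toy sketch of what that means for callers (names are illustrative):

```python
def run_step(step_fn, ctx, inputs):
    # New convention: pass the replica's inputs as a single value (often a
    # tuple) rather than splatting them into positional arguments.
    return step_fn(ctx, inputs)

def step_fn(ctx, inputs):
    features, labels = inputs   # the step function unpacks explicitly now
    return features + labels

print(run_step(step_fn, ctx=None, inputs=(3, 4)))  # 7
```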
def _merge_call_raises_fn(): - distribution_strategy_context.get_replica_context().merge_call( - _call_raises_fn) + ds_context.get_replica_context().merge_call(_call_raises_fn) # Must be the argument to a get_replica_context().merge_call() call, calls @@ -74,8 +77,7 @@ def _call_merge_raises_fn(dist): # get_replica_context().merge_call() that calls a call_for_each_replica() that # calls a get_replica_context().merge_call() that raises an exception. def _merge_call_merge_raises_fn(): - distribution_strategy_context.get_replica_context().merge_call( - _call_merge_raises_fn) + ds_context.get_replica_context().merge_call(_call_merge_raises_fn) class DistributionTestBase(test.TestCase): @@ -114,8 +116,8 @@ class DistributionTestBase(test.TestCase): before_list.append(fetched) # control_dependencies irrelevant but harmless in eager execution with ops.control_dependencies([fetched]): - g = d.reduce( - variable_scope.VariableAggregation.SUM, g, destinations=v) + g = d.extended.reduce_to( + reduce_util.ReduceOp.SUM, g, destinations=v) with ops.control_dependencies(d.update( v, update, g, grouped=False)): after_list.append(d.read_var(v)) @@ -169,8 +171,8 @@ class DistributionTestBase(test.TestCase): fetched = d.read_var(v) before_list.append(fetched) with ops.control_dependencies([fetched]): - g = d.reduce( - variable_scope.VariableAggregation.SUM, g, destinations=v) + g = d.extended.reduce_to( + reduce_util.ReduceOp.SUM, g, destinations=v) with ops.control_dependencies(d.update( v, update, g, grouped=False)): after_list.append(d.read_var(v)) @@ -189,31 +191,20 @@ class DistributionTestBase(test.TestCase): # Error should go down self.assertLess(error_after, error_before) - def _test_device_index(self, d): - with d.scope(): - expected_devices = [False] * len(d.worker_devices) - - def mark_devices_fn(device_id): - self.assertLess(device_id, len(d.worker_devices)) - self.assertFalse(expected_devices[device_id]) - expected_devices[device_id] = True - - d.call_for_each_replica(mark_devices_fn, args=(d.worker_device_index,)) - self.assertAllEqual(expected_devices, [True] * len(d.worker_devices)) - def _test_replica_id(self, d): with d.scope(): - expected_devices = [False] * len(d.worker_devices) + expected_devices = [False] * len(d.extended.worker_devices) def mark_devices_fn(): - replica_id = ( - distribution_strategy_context.get_replica_context().replica_id) - self.assertLess(replica_id, len(d.worker_devices)) + replica_id = self.evaluate( + ds_context.get_replica_context().replica_id_in_sync_group) + self.assertLess(replica_id, len(d.extended.worker_devices)) self.assertFalse(expected_devices[replica_id]) expected_devices[replica_id] = True d.call_for_each_replica(mark_devices_fn) - self.assertAllEqual(expected_devices, [True] * len(d.worker_devices)) + self.assertAllEqual(expected_devices, + [True] * len(d.extended.worker_devices)) def _test_call_and_merge_exceptions(self, dist): with dist.scope(): @@ -225,3 +216,78 @@ class DistributionTestBase(test.TestCase): dist.call_for_each_replica(_merge_call_raises_fn) with self.assertRaises(_TestException): dist.call_for_each_replica(_merge_call_merge_raises_fn) + + def _input_fn_to_test_input_context(self, + dataset_fn, + expected_num_replicas_in_sync, + expected_num_input_pipelines, + expected_input_pipeline_id): + # Use a list of one element as counter so that it can be captured by the + # `_input_fn`. This counter is incremented by 1 each time an input_fn is + # called. 
We use this counter to check whether the `input_pipeline_id` + # matches the counter in the in-graph replication. + worker_id_counter = [0] + + def _input_fn(input_context): + """Input fn for testing.""" + self.assertIsNotNone(input_context) + self.assertEqual(expected_num_replicas_in_sync, + input_context.num_replicas_in_sync) + self.assertEqual(expected_num_input_pipelines, + input_context.num_input_pipelines) + if expected_input_pipeline_id is not None: + self.assertEqual(expected_input_pipeline_id, + input_context.input_pipeline_id) + else: + self.assertEqual(worker_id_counter[0], input_context.input_pipeline_id) + worker_id_counter[0] += 1 + + return dataset_fn() + + return _input_fn + + def _test_input_fn_iterator(self, iterator, devices, expected_values, + sess=None): + evaluate = lambda x: sess.run(x) if sess else self.evaluate(x) + evaluate(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = evaluate( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + evaluate([values.select_device(d, next_element) for d in devices]) + + # After re-initializing the iterator, should be able to iterate again. + evaluate(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = evaluate( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + + def _test_global_step_update(self, strategy): + with strategy.scope(): + global_step = variable_scope.get_variable( + "global_step", + shape=[], + dtype=dtypes.int64, + initializer=init_ops.zeros_initializer(), + trainable=False, + aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA) + self.evaluate(variables.global_variables_initializer()) + + def model_fn(): + train_op = global_step.assign_add(1) + value = global_step.read_value() + return train_op, value + + train_ops, value = strategy.call_for_each_replica(model_fn) + self.evaluate(strategy.group(train_ops)) + global_step_tensors = strategy.unwrap(value) + global_step_values = self.evaluate(global_step_tensors) + self.assertEqual([1] * len(global_step_tensors), global_step_values) diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py index f5b4531ba8c..39ed8f7cf10 100644 --- a/tensorflow/contrib/distribute/python/tpu_strategy.py +++ b/tensorflow/contrib/distribute/python/tpu_strategy.py @@ -21,25 +21,28 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import copy import functools -from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib -from tensorflow.contrib.distribute.python import values from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.contrib.tpu.python.tpu import tpu from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib from tensorflow.contrib.tpu.python.tpu import training_loop +from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import distribute_lib +from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import context from 
tensorflow.python.eager import tape from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope as vs -from tensorflow.python.ops import variables as variables_lib -from tensorflow.python.training import device_util -from tensorflow.python.training import distribute as distribute_lib from tensorflow.python.util import nest @@ -130,8 +133,21 @@ class TPUStrategy(distribute_lib.DistributionStrategy): num_cores: Number of cores to use on the TPU. If None specified, then auto-detect the cores and topology of the TPU system. """ - super(TPUStrategy, self).__init__() + super(TPUStrategy, self).__init__(TPUExtended( + self, tpu_cluster_resolver, steps_per_run, num_cores)) + @property + def steps_per_run(self): + """DEPRECATED: use .extended.steps_per_run instead.""" + return self._extended.steps_per_run + + +class TPUExtended(distribute_lib.DistributionStrategyExtended): + """Implementation of TPUStrategy.""" + + def __init__(self, container_strategy, tpu_cluster_resolver, steps_per_run, + num_cores=None): + super(TPUExtended, self).__init__(container_strategy) self._tpu_cluster_resolver = tpu_cluster_resolver self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver) # TODO(sourabhbajaj): Change this from num_cores to metadata_override @@ -145,7 +161,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy): self._host_device = self.get_host_cpu_device(0) self._tpu_devices = sorted(device_map.keys()) # Only create variables for the number of replicas we're running. - self._tpu_devices = self._tpu_devices[:self.num_replicas] + self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync] # TODO(sourabhbajaj): Remove this once performance of running one step # at a time is comparable to multiple steps. @@ -214,7 +230,17 @@ class TPUStrategy(distribute_lib.DistributionStrategy): return enqueue_op_per_host - def distribute_dataset(self, dataset_fn): + def _make_dataset_iterator(self, dataset): + """Make iterators for each of the TPU hosts.""" + + worker_devices = [ + (self.get_host(hid), [self.get_host_cpu_device(hid)]) + for hid in range(self.num_hosts) + ] + return values.DatasetIterator(dataset, worker_devices, + self._num_replicas_in_sync) + + def _distribute_dataset(self, dataset_fn): worker_devices = [ (self.get_host(hid), [self.get_host_cpu_device(hid)]) for hid in range(self.num_hosts) @@ -225,12 +251,11 @@ class TPUStrategy(distribute_lib.DistributionStrategy): # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed. # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have # a mechanism to infer the outputs of `fn`. Pending b/110550782. - def _run_steps_on_dataset(self, fn, multi_worker_iterator, iterations, - initial_loop_values=None): - + def _experimental_run_steps_on_iterator( + self, fn, multi_worker_iterator, iterations, initial_loop_values=None): output_shapes = multi_worker_iterator.output_shapes shapes = nest.flatten(output_shapes) - if any([not s.is_fully_defined() for s in shapes]): + if any(not s.is_fully_defined() for s in shapes): raise ValueError( "TPU currently requires fully defined shapes. 
Either use " "set_shape() on the input tensors or use " @@ -251,13 +276,13 @@ class TPUStrategy(distribute_lib.DistributionStrategy): initial_loop_values = {} initial_loop_values = nest.flatten(initial_loop_values) ctx = values.MultiStepContext() - def run_fn(*args, **kwargs): + + def run_fn(): """Single step on the TPU device.""" - del args, kwargs fn_inputs = dequeue_fn() if not isinstance(fn_inputs, tuple): fn_inputs = (fn_inputs,) - fn_result = fn(ctx, *fn_inputs) + fn_result = fn(ctx, fn_inputs) flat_last_step_outputs = nest.flatten(ctx.last_step_outputs) if flat_last_step_outputs: with ops.control_dependencies([fn_result]): @@ -265,11 +290,6 @@ class TPUStrategy(distribute_lib.DistributionStrategy): else: return fn_result - # TODO(sourabhbajaj): The input to while loop should be based on the output - # type of the step_fn - def iterate_on_tpu(): - return training_loop.repeat(iterations, run_fn, initial_loop_values) - # We capture the control_flow_context at this point, before we run `fn` # inside a while_loop and TPU replicate context. This is useful in cases # where we might need to exit these contexts and get back to the outer @@ -279,38 +299,70 @@ class TPUStrategy(distribute_lib.DistributionStrategy): self._outer_control_flow_context = ( ops.get_default_graph()._get_control_flow_context()) # pylint: disable=protected-access - replicate_inputs = [[]] * self.num_replicas - replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs) + def rewrite_fn(*args): + """The rewritten step fn running on TPU.""" + del args + replicate_inputs = [[]] * self._num_replicas_in_sync + replicate_outputs = tpu.replicate(run_fn, replicate_inputs) + + # If run_fn has tensor outputs, tpu.replicate returns a list of list. We + # will flatten it in this case. If run_fn has no tensor outputs, + # tpu.replicate returns a list of no_ops, we will keep the output as it + # is. + if isinstance(replicate_outputs[0], list): + replicate_outputs = nest.flatten(replicate_outputs) + + return replicate_outputs + + # TODO(sourabhbajaj): The input to while loop should be based on the output + # type of the step_fn + assert isinstance(initial_loop_values, list) + initial_loop_values = initial_loop_values * self._num_replicas_in_sync + + # Put the while loop op on host 0. + with ops.device(self.get_host_cpu_device(0)): + replicate_outputs = training_loop.repeat(iterations, rewrite_fn, + initial_loop_values) + del self._outer_control_flow_context ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops) - # Filter out any ops from the outputs, typically this would be the case - # when there were no tensor outputs. - last_step_tensor_outputs = [x for x in replicate_outputs - if not isinstance(x, ops.Operation)] + if isinstance(replicate_outputs, list): + # Filter out any ops from the outputs, typically this would be the case + # when there were no tensor outputs. 
+ last_step_tensor_outputs = [ + x for x in replicate_outputs if not isinstance(x, ops.Operation) + ] - # Outputs are currently of the structure (grouped by device) - # [[output0_device0, output1_device0, output2_device0], - # [output0_device1, output1_device1, output2_device1]] - # Convert this to the following structure instead: (grouped by output) - # [[output0_device0, output0_device1], - # [output1_device0, output1_device1], - # [output2_device0, output2_device1]] - last_step_tensor_outputs = [list(x) for x in zip(*last_step_tensor_outputs)] + # Outputs are currently of the structure (flattened) + # [output0_device0, output1_device0, output2_device0, + # output0_device1, output1_device1, output2_device1, + # ...] + # Convert this to the following structure instead: (grouped by output) + # [[output0_device0, output0_device1], + # [output1_device0, output1_device1], + # [output2_device0, output2_device1]] + output_num = len(last_step_tensor_outputs) // self._num_replicas_in_sync + last_step_tensor_outputs = [ + last_step_tensor_outputs[i::output_num] for i in range(output_num) + ] + else: + # no tensors returned. + last_step_tensor_outputs = [] # Convert replicate_outputs to the original dict structure of # last_step_outputs. last_step_tensor_outputs_dict = nest.pack_sequence_as( ctx.last_step_outputs, last_step_tensor_outputs) - for (name, aggregation) in ctx._last_step_outputs_aggregations.items(): # pylint: disable=protected-access + for name, reduce_op in ctx._last_step_outputs_reduce_ops.items(): # pylint: disable=protected-access output = last_step_tensor_outputs_dict[name] - # For outputs that have already been aggregated, take the first value + # For outputs that have already been reduced, take the first value # from the list as each value should be the same. Else return the full # list of values. - # TODO(josh11b): If aggregation is NONE, we should return a PerReplica + # TODO(josh11b): If reduce_op is NONE, we should return a PerReplica # value. - if aggregation is not variables_lib.VariableAggregation.NONE: + if reduce_op is not None: # TODO(priyag): Should this return the element or a list with 1 element last_step_tensor_outputs_dict[name] = output[0] ctx._set_last_step_outputs(last_step_tensor_outputs_dict) # pylint: disable=protected-access @@ -320,10 +372,10 @@ class TPUStrategy(distribute_lib.DistributionStrategy): def _call_for_each_replica(self, fn, args, kwargs): # TODO(jhseu): Consider making it so call_for_each_replica implies that # we're in a tpu.rewrite(), and update TPUMirroredVariable accordingly. - with _TPUReplicaContext(self): + with _TPUReplicaContext(self._container_strategy()): return fn(*args, **kwargs) - def initialize(self): + def _initialize(self): if context.executing_eagerly(): # TODO(priyag): Add appopriate call here when eager is supported for TPUs. raise NotImplementedError("Eager mode not supported in TPUStrategy.") @@ -338,7 +390,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy): tpu.initialize_system()) return graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION) - def finalize(self): + def _finalize(self): if context.executing_eagerly(): # TODO(priyag): Add appopriate call here when eager is supported for TPUs. raise NotImplementedError("Eager mode not supported in TPUStrategy.") @@ -346,7 +398,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy): return [tpu.shutdown_system()] def _get_devices_from(self, colocate_with=None): - # TODO(jhseu): Change this when we support model parallelism. 
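The regrouping above relies on `tpu.replicate` returning one flat, replica-major list of last-step outputs. The strided slicing can be checked in isolation:

```python
# Flat, replica-major ordering produced by the rewritten while loop:
flattened = ["out0@rep0", "out1@rep0", "out2@rep0",
             "out0@rep1", "out1@rep1", "out2@rep1"]
num_replicas_in_sync = 2
output_num = len(flattened) // num_replicas_in_sync   # 3 distinct outputs
grouped = [flattened[i::output_num] for i in range(output_num)]
print(grouped)
# [['out0@rep0', 'out0@rep1'],
#  ['out1@rep0', 'out1@rep1'],
#  ['out2@rep0', 'out2@rep1']]
```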
+ # TODO(jhseu): Change this when we support model parallelism. return self._tpu_devices def _create_variable(self, next_creator, *args, **kwargs): @@ -383,12 +435,12 @@ class TPUStrategy(distribute_lib.DistributionStrategy): return _create_tpu_mirrored_variable(devices, _real_mirrored_creator, *args, **kwargs) - def _reduce(self, aggregation, value, destinations): + def _reduce_to(self, reduce_op, value, destinations): if values._enclosing_tpu_context() is not None: # pylint: disable=protected-access - if aggregation == vs.VariableAggregation.MEAN: + if reduce_op == reduce_util.ReduceOp.MEAN: # TODO(jhseu): Revisit once we support model-parallelism. - value *= (1. / self.num_replicas) - elif aggregation != vs.VariableAggregation.SUM: + value *= (1. / self._num_replicas_in_sync) + elif reduce_op != reduce_util.ReduceOp.SUM: raise NotImplementedError( "Currently only support sum & mean in TPUStrategy.") return tpu_ops.cross_replica_sum(value) @@ -396,27 +448,22 @@ class TPUStrategy(distribute_lib.DistributionStrategy): # Validate that the destination is same as the host device # Note we don't do this when in replicate context as the reduction is # performed on the TPU device itself. - devices = cross_tower_ops_lib.get_devices_from(destinations) + devices = cross_device_ops_lib.get_devices_from(destinations) if len(devices) == 1: assert device_util.canonicalize(devices[0]) == device_util.canonicalize( self._host_device) else: raise ValueError("Multiple devices are not supported for TPUStrategy") - if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: - return value[0] output = math_ops.add_n(value) - if aggregation == vs.VariableAggregation.MEAN: + if reduce_op == reduce_util.ReduceOp.MEAN: return output * (1. / len(value)) return output - def _update(self, var, options, fn, *args, **kwargs): + def _update(self, var, fn, args, kwargs, group): assert isinstance(var, values.TPUMirroredVariable) - should_group = options.pop("grouped") - assert not options # Validate that we are processing all of the options. - if values._enclosing_tpu_context() is not None: # pylint: disable=protected-access - if should_group: + if group: return fn(var, *args, **kwargs) else: return [fn(var, *args, **kwargs)] @@ -431,9 +478,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy): updates[d] = fn(v, *values.select_device_mirrored(d, args), **values.select_device_mirrored(d, kwargs)) - return values.update_regroup(self, updates, should_group) - - # TODO(josh11b): Need to implement _update_non_slot()! 
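`_reduce_to` above supports only SUM and MEAN: inside the replicated context MEAN is a cross-replica sum scaled by `1/num_replicas`, and on the host it is `add_n` followed by a division. The arithmetic, in plain Python:

```python
def reduce_values(reduce_op, per_replica_values):
    total = sum(per_replica_values)   # stands in for math_ops.add_n
    if reduce_op == "sum":
        return total
    if reduce_op == "mean":
        return total * (1.0 / len(per_replica_values))
    raise NotImplementedError("Currently only support sum & mean.")

print(reduce_values("mean", [2.0, 4.0]))  # 3.0
print(reduce_values("sum", [2.0, 4.0]))   # 6.0
```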
+ return values.update_regroup(self, updates, group) def read_var(self, var): assert isinstance(var, values.TPUMirroredVariable) @@ -453,14 +498,10 @@ class TPUStrategy(distribute_lib.DistributionStrategy): def value_container(self, value): return value - def _broadcast(self, tensor, destinations): + def _broadcast_to(self, tensor, destinations): del destinations return tensor - @property - def num_replicas(self): - return self._num_cores_override or self._tpu_metadata.num_cores - @property def num_hosts(self): return self._tpu_metadata.num_hosts @@ -470,15 +511,15 @@ class TPUStrategy(distribute_lib.DistributionStrategy): return self._tpu_metadata.num_of_cores_per_host @property - def num_replicas_in_sync(self): - return self.num_replicas + def _num_replicas_in_sync(self): + return self._num_cores_override or self._tpu_metadata.num_cores @property - def between_graph(self): + def experimental_between_graph(self): return False @property - def should_init(self): + def experimental_should_init(self): return True @property @@ -500,14 +541,12 @@ class TPUStrategy(distribute_lib.DistributionStrategy): def non_slot_devices(self, var_list): return self._host_device - def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs): + def _update_non_slot(self, colocate_with, fn, args, kwargs, group): del colocate_with - should_group = options.pop("grouped") - assert not options # Validate that we are processing all of the options. with ops.device(self._host_device), distribute_lib.UpdateContext( self._host_device): result = fn(*args, **kwargs) - if should_group: + if group: return result else: return nest.map_structure(self._unwrap, result) @@ -521,17 +560,27 @@ class TPUStrategy(distribute_lib.DistributionStrategy): def get_host_cpu_device(self, host_id): return self.get_host(host_id) + "/device:CPU:0" - def configure(self, - session_config=None, - cluster_spec=None, - task_type=None, - task_id=None): + def _configure(self, + session_config=None, + cluster_spec=None, + task_type=None, + task_id=None): del cluster_spec, task_type, task_id if session_config: - session_config.isolate_session_state = True - cluster_spec = self._tpu_cluster_resolver.cluster_spec() - if cluster_spec: - session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def()) + session_config.CopyFrom(self._update_config_proto(session_config)) + + def _update_config_proto(self, config_proto): + updated_config = copy.deepcopy(config_proto) + updated_config.isolate_session_state = True + cluster_spec = self._tpu_cluster_resolver.cluster_spec() + if cluster_spec: + updated_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def()) + return updated_config + + # TODO(priyag): Delete this once all strategies use global batch size. + @property + def _global_batch_size(self): + return True class _TPUReplicaContext(distribute_lib.ReplicaContext): @@ -540,13 +589,14 @@ class _TPUReplicaContext(distribute_lib.ReplicaContext): # TODO(sourabhbajaj): Call for each tower should be updating this. 
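This file now follows the container/extended split: `TPUStrategy` becomes a thin facade whose behavior lives in `TPUExtended`, with deprecated properties such as `steps_per_run` forwarding to the extended object. A plain-Python sketch of the pattern (class names here are illustrative, not the real API):

```python
class StrategyExtended(object):
    def __init__(self, container_strategy, steps_per_run):
        self._container = container_strategy   # back-reference to the facade
        self.steps_per_run = steps_per_run

    def _container_strategy(self):
        return self._container

class Strategy(object):
    def __init__(self, steps_per_run):
        self._extended = StrategyExtended(self, steps_per_run)

    @property
    def extended(self):
        return self._extended

    @property
    def steps_per_run(self):
        """Deprecated alias kept on the facade; prefer .extended.steps_per_run."""
        return self._extended.steps_per_run

s = Strategy(steps_per_run=2)
assert s.steps_per_run == s.extended.steps_per_run == 2
```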
def __init__(self, distribution_strategy): distribute_lib.ReplicaContext.__init__( - self, distribution_strategy, replica_id=0) - - @property - def device(self): - raise RuntimeError("Use .devices instead") + self, + distribution_strategy, + # TODO(b/118385803): properly initialize replica_id, instead of always 0 + replica_id_in_sync_group=constant_op.constant(0, dtypes.int32)) @property def devices(self): distribute_lib.require_replica_context(self) - return [self._distribution_strategy.worker_devices[self._replica_id]] + ds = self._distribution_strategy + replica_id = tensor_util.constant_value(self._replica_id_in_sync_group) + return [ds.extended.worker_devices[replica_id]] diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py index 268393ee801..538b859f3d1 100644 --- a/tensorflow/contrib/distribute/python/values_test.py +++ b/tensorflow/contrib/distribute/python/values_test.py @@ -19,12 +19,15 @@ from __future__ import division from __future__ import print_function import os +from absl.testing import parameterized -from tensorflow.contrib.distribute.python import mirrored_strategy +from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import multi_worker_test_base -from tensorflow.contrib.distribute.python import values from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import distribute_lib +from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.estimator import model_fn as model_fn_lib @@ -34,10 +37,10 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables as variables_lib -from tensorflow.python.training import device_util from tensorflow.python.training import saver as saver_lib from tensorflow.python.util import nest @@ -324,20 +327,20 @@ class RegroupAndSelectDeviceTest(test.TestCase): self.assertTrue( isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec)) - self.assertEquals(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode) + self.assertEqual(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode) for device_id in range(3): d = _device_str(device_id) - self.assertEquals(created_estimator_specs[device_id].loss, - merged_estimator_spec.loss.get(d)) - self.assertEquals(created_estimator_specs[device_id].train_op, - merged_estimator_spec.train_op.get(d)) + self.assertEqual(created_estimator_specs[device_id].loss, + merged_estimator_spec.loss.get(d)) + self.assertEqual(created_estimator_specs[device_id].train_op, + merged_estimator_spec.train_op.get(d)) # Scaffold is populated by `EstimatorSpec.__new__`. 
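`_TPUReplicaContext.devices` above (and `_get_replica_id_integer` in the test file) both fold the tensor-valued `replica_id_in_sync_group` back into a Python integer before indexing `worker_devices`; the real code uses `tensor_util.constant_value` for that. A stand-in sketch of the lookup:

```python
def device_for_replica(replica_id, worker_devices):
    # `replica_id` may arrive as a constant tensor; fold it to an int first
    # (tensor_util.constant_value in the real code) and index the device list.
    replica_id = int(replica_id)
    return [worker_devices[replica_id]]

print(device_for_replica(0, ["/device:TPU:0", "/device:TPU:1"]))
# ['/device:TPU:0']
```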
- self.assertEquals(created_estimator_specs[device_id].scaffold, - merged_estimator_spec.scaffold.get(d)) + self.assertEqual(created_estimator_specs[device_id].scaffold, + merged_estimator_spec.scaffold.get(d)) # Also test that we can undo the merge using select_device() - self.assertEquals(created_estimator_specs[device_id], - values.select_device(_device_str(device_id), - merged_estimator_spec)) + self.assertEqual(created_estimator_specs[device_id], + values.select_device(_device_str(device_id), + merged_estimator_spec)) class PerReplicaDatasetTest(test.TestCase): @@ -568,7 +571,184 @@ class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase): multi_worker_iterator.get_next() -class MirroredVariableTest(test.TestCase): +class InputIteratorTestBase(test.TestCase): + + def _test_iterator(self, input_type, dataset_fn, worker_device_pairs, + expected_values, sess=None, split_batch_by=None): + devices = nest.flatten([ds for _, ds in worker_device_pairs]) + + if input_type == "input_fn": + input_contexts = [ + distribute_lib.InputContext() for _ in worker_device_pairs] + input_fn = lambda _: dataset_fn() + iterator = values.InputFunctionIterator(input_fn, worker_device_pairs, + input_contexts) + else: + iterator = values.DatasetIterator(dataset_fn(), worker_device_pairs, + split_batch_by) + + evaluate = lambda x: sess.run(x) if sess else self.evaluate(x) + + evaluate(control_flow_ops.group(iterator.initialize())) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = evaluate( + [values.select_device(d, next_element) for d in devices]) + self.assertAllEqual(expected_value, computed_value) + + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + evaluate([values.select_device(d, next_element) for d in devices]) + + # After re-initializing the iterator, should be able to iterate again. 
+ evaluate(control_flow_ops.group(iterator.initialize())) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = evaluate( + [values.select_device(d, next_element) for d in devices]) + self.assertAllEqual(expected_value, computed_value) + + +class InputIteratorSingleWorkerTest(InputIteratorTestBase, + parameterized.TestCase): + + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + input_type=["input_fn", "dataset"])) + def testOneDeviceCPU(self, input_type): + worker_device_pairs = [("", ["/device:CPU:0"])] + dataset_fn = lambda: dataset_ops.Dataset.range(10) + + expected_values = [[i] for i in range(10)] + + self._test_iterator(input_type, dataset_fn, worker_device_pairs, + expected_values) + + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + input_type=["input_fn", "dataset"], + required_gpus=1)) + def testTwoDevicesOneGPUOneCPU(self, input_type): + worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])] + dataset_fn = lambda: dataset_ops.Dataset.range(10) + + expected_values = [[i, i+1] for i in range(0, 10, 2)] + + self._test_iterator(input_type, dataset_fn, worker_device_pairs, + expected_values) + + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + input_type=["input_fn", "dataset"], + required_gpus=1)) + def testTupleDataset(self, input_type): + worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])] + def dataset_fn(): + dataset1 = dataset_ops.Dataset.range(10) + dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2) + return dataset_ops.Dataset.zip((dataset1, dataset2)) + + expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)] + + self._test_iterator(input_type, dataset_fn, worker_device_pairs, + expected_values) + + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + input_type=["input_fn", "dataset"], + required_gpus=1)) + def testUnevenDatasetBatches(self, input_type): + worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])] + dataset_fn = lambda: dataset_ops.Dataset.range(11) + + expected_values = [[i, i+1] for i in range(0, 10, 2)] + self._test_iterator(input_type, dataset_fn, worker_device_pairs, + expected_values) + + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + input_type=["dataset"], + split_batch_by=[None, 2], + required_gpus=1)) + def testBatchSplitting(self, input_type, split_batch_by): + worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])] + batch_size = 10 + dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size) + + updated_batch_size = ( + batch_size // split_batch_by if split_batch_by else batch_size) + expected_values = [[range(i, i+updated_batch_size), + range(i+updated_batch_size, i+2*updated_batch_size)] + for i in range(0, 100, updated_batch_size*2)] + + self._test_iterator(input_type, dataset_fn, worker_device_pairs, + expected_values, sess=None, + split_batch_by=split_batch_by) + + +class InputIteratorMultiWorkerTest( + multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase, + parameterized.TestCase): + + def _cpu_devices(self): + return [ + ("/job:worker/replica:0/task:0", + ["/job:worker/replica:0/task:0/device:CPU:0"]), + ("/job:worker/replica:0/task:1", + ["/job:worker/replica:0/task:1/device:CPU:0"])] + + def _cpu_and_one_gpu_devices(self): + return [ + ("/job:worker/replica:0/task:0", [ + "/job:worker/replica:0/task:0/device:GPU:0", + "/job:worker/replica:0/task:0/device:CPU:0" + ]), + 
("/job:worker/replica:0/task:1", [ + "/job:worker/replica:0/task:1/device:GPU:0", + "/job:worker/replica:0/task:1/device:CPU:0" + ]) + ] + + @combinations.generate(combinations.combine( + mode=["graph"], + input_type=["input_fn", "dataset"])) + def testOneDevicePerWorker(self, input_type): + worker_devices = self._cpu_devices() + with context.graph_mode(), self.cached_session() as sess: + dataset_fn = lambda: dataset_ops.Dataset.range(4) + self._test_iterator(input_type, dataset_fn, worker_devices, + [[0, 0], [1, 1], [2, 2], [3, 3]], sess) + + @combinations.generate(combinations.combine( + mode=["graph"], + input_type=["input_fn", "dataset"], + required_gpus=1)) + def testTwoDevicesPerWorker(self, input_type): + worker_devices = self._cpu_and_one_gpu_devices() + with context.graph_mode(), self.cached_session() as sess: + dataset_fn = lambda: dataset_ops.Dataset.range(4) + self._test_iterator(input_type, dataset_fn, worker_devices, + [[0, 1, 0, 1], [2, 3, 2, 3]], sess) + + @combinations.generate(combinations.combine( + mode=["graph"], + input_type=["input_fn", "dataset"])) + def testTupleDataset(self, input_type): + worker_devices = self._cpu_devices() + with context.graph_mode(), self.cached_session() as sess: + def dataset_fn(): + dataset1 = dataset_ops.Dataset.range(4) + dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2) + return dataset_ops.Dataset.zip((dataset1, dataset2)) + + expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)] + self._test_iterator(input_type, dataset_fn, worker_devices, + expected_values, sess) + + +class MirroredVariableTest(test.TestCase, parameterized.TestCase): config = config_pb2.ConfigProto() config.allow_soft_placement = True @@ -580,9 +760,9 @@ class MirroredVariableTest(test.TestCase): v, _, mirrored = _make_mirrored() - self.assertEquals(v[0].name, mirrored.name) - self.assertEquals(v[0].dtype, mirrored.dtype) - self.assertEquals(v[0].shape, mirrored.shape) + self.assertEqual(v[0].name, mirrored.name) + self.assertEqual(v[0].dtype, mirrored.dtype) + self.assertEqual(v[0].shape, mirrored.shape) @test_util.run_in_graph_and_eager_modes(config=config) def testVariableOnAnotherDevice(self): @@ -592,9 +772,9 @@ class MirroredVariableTest(test.TestCase): mirrored = values.MirroredVariable(index, v, variable_scope.VariableAggregation.MEAN) - self.assertEquals(v.name, mirrored.name) - self.assertEquals(v.dtype, mirrored.dtype) - self.assertEquals(v.shape, mirrored.shape) + self.assertEqual(v.name, mirrored.name) + self.assertEqual(v.dtype, mirrored.dtype) + self.assertEqual(v.shape, mirrored.shape) def _assign_mirrored(self, devices, v, new): for d, var, n in zip(devices, v, new): @@ -714,14 +894,13 @@ class MirroredVariableTest(test.TestCase): save_path = self._save_normal() self._restore_mirrored(save_path) - @test_util.run_in_graph_and_eager_modes(config=config) - def testFetchAMirroredVariable(self): - if context.num_gpus() < 1 or context.executing_eagerly(): - self.skipTest("A GPU is not available for this test or it's eager mode.") - - with self.session( - graph=ops.Graph()) as sess, mirrored_strategy.MirroredStrategy( - ["/device:GPU:0"]).scope(): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_one_gpu, + combinations.core_mirrored_strategy_with_one_gpu], + mode=["graph"])) + def testFetchAMirroredVariable(self, distribution): + with self.session(graph=ops.Graph()) as sess, distribution.scope(): with ops.device("/device:GPU:0"): v = variable_scope.get_variable( name="v", 
initializer=1., use_resource=True) @@ -747,7 +926,7 @@ def _make_replica_local(method): return v, replica_local -class ReplicaLocalVariableTest(test.TestCase): +class ReplicaLocalVariablePropertiesTest(test.TestCase): config = config_pb2.ConfigProto() config.allow_soft_placement = True @@ -756,15 +935,14 @@ class ReplicaLocalVariableTest(test.TestCase): def testProperties(self): if context.num_gpus() < 1 and context.executing_eagerly(): self.skipTest("A GPU is not available for this test in eager mode.") - v, replica_local = _make_replica_local( variable_scope.VariableAggregation.SUM) - self.assertEquals(v[0].name, replica_local.name) - self.assertEquals(v[0].dtype, replica_local.dtype) - self.assertEquals(v[0].shape, replica_local.shape) - self.assertEquals(variable_scope.VariableAggregation.SUM, - replica_local.aggregation) + self.assertEqual(v[0].name, replica_local.name) + self.assertEqual(v[0].dtype, replica_local.dtype) + self.assertEqual(v[0].shape, replica_local.shape) + self.assertEqual(variable_scope.VariableAggregation.SUM, + replica_local.aggregation) @test_util.run_in_graph_and_eager_modes(config=config) def testVariableOnAnotherDevice(self): @@ -774,11 +952,32 @@ class ReplicaLocalVariableTest(test.TestCase): replica_local = values.ReplicaLocalVariable( index, v, variable_scope.VariableAggregation.MEAN) - self.assertEquals(v.name, replica_local.name) - self.assertEquals(v.dtype, replica_local.dtype) - self.assertEquals(v.shape, replica_local.shape) - self.assertEquals(variable_scope.VariableAggregation.MEAN, - replica_local.aggregation) + self.assertEqual(v.name, replica_local.name) + self.assertEqual(v.dtype, replica_local.dtype) + self.assertEqual(v.shape, replica_local.shape) + self.assertEqual(variable_scope.VariableAggregation.MEAN, + replica_local.aggregation) + + def testTensorConversion(self): + with context.graph_mode(): + _, replica_local = _make_replica_local( + variable_scope.VariableAggregation.SUM) + converted = ops.internal_convert_to_tensor(replica_local, as_ref=False) + self.assertIsInstance(converted, ops.Tensor) + self.assertEqual(converted.dtype, replica_local.dtype) + + converted = ops.internal_convert_to_tensor(replica_local, as_ref=True) + # Resources variable are converted to tensors as well when as_ref is True. + self.assertIsInstance(converted, ops.Tensor) + self.assertEqual(converted.dtype, replica_local.dtype) + + +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) +class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase): def _assign_replica_local(self, devices, v, new): for d, var, n in zip(devices, v, new): @@ -795,22 +994,15 @@ class ReplicaLocalVariableTest(test.TestCase): save_path, _ = self._save_return_saver(sess, var) return save_path - def _dist_scope(self): - return mirrored_strategy.MirroredStrategy(_devices).scope() - - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveAndRestoreReplicaLocalSumOneGraph(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") - - with self.cached_session(config=self.config) as sess: + def testSaveAndRestoreReplicaLocalSumOneGraph(self, distribution): + with self.cached_session() as sess: v, replica_local = _make_replica_local( variable_scope.VariableAggregation.SUM) # Overwrite the initial values. 
self._assign_replica_local(_devices, v, [3., 4.]) - with self._dist_scope(): + with distribution.scope(): # Saves the current value of v[0] + v[1], 7. save_path, saver = self._save_return_saver(sess, replica_local) @@ -822,19 +1014,18 @@ class ReplicaLocalVariableTest(test.TestCase): saver.restore(sess, save_path) self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]])) - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveAndRestoreReplicaLocalMeanOneGraph(self): + def testSaveAndRestoreReplicaLocalMeanOneGraph(self, distribution): if context.num_gpus() < 1 and context.executing_eagerly(): self.skipTest("A GPU is not available for this test in eager mode.") - with self.cached_session(config=self.config) as sess: + with self.cached_session() as sess: v, replica_local = _make_replica_local( variable_scope.VariableAggregation.MEAN) # Overwrite the initial values. self._assign_replica_local(_devices, v, [3., 4.]) - with self._dist_scope(): + with distribution.scope(): # Saves the current value of (v[0] + v[1])/2, 3.5. save_path, saver = self._save_return_saver(sess, replica_local) @@ -845,7 +1036,7 @@ class ReplicaLocalVariableTest(test.TestCase): saver.restore(sess, save_path) self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]])) - def _save_replica_local_mean(self): + def _save_replica_local_mean(self, distribution): """Save variables with mirroring, returns save_path.""" with self.session(graph=ops.Graph()) as sess: v, replica_local = _make_replica_local( @@ -854,7 +1045,7 @@ class ReplicaLocalVariableTest(test.TestCase): # Overwrite the initial values. self._assign_replica_local(_devices, v, [3., 4.]) - with self._dist_scope(): + with distribution.scope(): # Saves the current value of (v[0] + v[1])/2, 3.5 save_path = self._save(sess, replica_local) @@ -862,7 +1053,7 @@ class ReplicaLocalVariableTest(test.TestCase): self._assign_replica_local(_devices, v, [5., 6.]) return save_path - def _save_replica_local_sum(self): + def _save_replica_local_sum(self, distribution): """Save variables with mirroring, returns save_path.""" with self.session(graph=ops.Graph()) as sess: v, replica_local = _make_replica_local("sum") @@ -870,7 +1061,7 @@ class ReplicaLocalVariableTest(test.TestCase): # Overwrite the initial values. self._assign_replica_local(_devices, v, [1.5, 2.]) - with self._dist_scope(): + with distribution.scope(): # Saves the current value of v[0] + v[1], 3.5 save_path = self._save(sess, replica_local) @@ -908,7 +1099,7 @@ class ReplicaLocalVariableTest(test.TestCase): saver.restore(sess, save_path) self.assertEqual(3.5, self.evaluate(var)) - def _restore_replica_local_mean(self, save_path): + def _restore_replica_local_mean(self, save_path, distribution): """Restore to variables with mirroring in a fresh graph.""" with self.session(graph=ops.Graph()) as sess: v, replica_local = _make_replica_local( @@ -917,13 +1108,13 @@ class ReplicaLocalVariableTest(test.TestCase): # Overwrite the initial values. self._assign_replica_local(_devices, v, [7., 8.]) - with self._dist_scope(): + with distribution.scope(): # Restores the saved value of 3.5 to both variables. 
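These save/restore tests pin down the checkpoint arithmetic for replica-local variables: a SUM variable is saved as the sum of its copies and restored by splitting evenly, while a MEAN variable is saved as the average and restored by broadcasting it. The expected numbers, reproduced in plain Python:

```python
def save_replica_local(per_replica, aggregation):
    total = sum(per_replica)
    return total / len(per_replica) if aggregation == "mean" else total

def restore_replica_local(saved, num_replicas, aggregation):
    if aggregation == "mean":
        return [saved] * num_replicas             # broadcast the average
    return [saved / num_replicas] * num_replicas  # split the sum evenly

print(save_replica_local([3.0, 4.0], "sum"))   # 7.0, as in the SUM test
print(restore_replica_local(3.5, 2, "sum"))    # [1.75, 1.75]
print(save_replica_local([3.0, 4.0], "mean"))  # 3.5
print(restore_replica_local(3.5, 2, "mean"))   # [3.5, 3.5]
```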
saver = saver_lib.Saver(var_list=[replica_local]) saver.restore(sess, save_path) self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]])) - def _restore_replica_local_sum(self, save_path): + def _restore_replica_local_sum(self, save_path, distribution): """Restore to variables with mirroring in a fresh graph.""" with self.session(graph=ops.Graph()) as sess: v, replica_local = _make_replica_local( @@ -932,72 +1123,35 @@ class ReplicaLocalVariableTest(test.TestCase): # Overwrite the initial values. self._assign_replica_local(_devices, v, [7., 8.]) - with self._dist_scope(): + with distribution.scope(): # Restores the saved value of 3.5 to both variables. saver = saver_lib.Saver(var_list=[replica_local]) saver.restore(sess, save_path) self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]])) - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveReplicaLocalRestoreReplicaLocalMean(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") + def testSaveReplicaLocalRestoreReplicaLocalMean(self, distribution): + save_path = self._save_replica_local_mean(distribution) + self._restore_replica_local_mean(save_path, distribution) - save_path = self._save_replica_local_mean() - self._restore_replica_local_mean(save_path) + def testSaveReplicaLocalRestoreReplicaLocalSum(self, distribution): + save_path = self._save_replica_local_sum(distribution) + self._restore_replica_local_sum(save_path, distribution) - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveReplicaLocalRestoreReplicaLocalSum(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") - - save_path = self._save_replica_local_sum() - self._restore_replica_local_sum(save_path) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveReplicaLocalMeanRestoreNormal(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") - - save_path = self._save_replica_local_mean() + def testSaveReplicaLocalMeanRestoreNormal(self, distribution): + save_path = self._save_replica_local_mean(distribution) self._restore_normal(save_path) - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveReplicaLocalSumRestoreNormal(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") - - save_path = self._save_replica_local_sum() + def testSaveReplicaLocalSumRestoreNormal(self, distribution): + save_path = self._save_replica_local_sum(distribution) self._restore_normal(save_path) - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveNormalRestoreReplicaLocalMean(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") - + def testSaveNormalRestoreReplicaLocalMean(self, distribution): save_path = self._save_normal() - self._restore_replica_local_mean(save_path) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveNormalRestoreReplicaLocalSum(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") + self._restore_replica_local_mean(save_path, distribution) + def testSaveNormalRestoreReplicaLocalSum(self, distribution): save_path = self._save_normal() - 
self._restore_replica_local_sum(save_path) - - def testTensorConversion(self): - with context.graph_mode(): - _, replica_local = _make_replica_local( - variable_scope.VariableAggregation.SUM) - converted = ops.internal_convert_to_tensor(replica_local, as_ref=False) - self.assertIsInstance(converted, ops.Tensor) - self.assertEqual(converted.dtype, replica_local.dtype) - - converted = ops.internal_convert_to_tensor(replica_local, as_ref=True) - # Resources variable are converted to tensors as well when as_ref is True. - self.assertIsInstance(converted, ops.Tensor) - self.assertEqual(converted.dtype, replica_local.dtype) + self._restore_replica_local_sum(save_path, distribution) if __name__ == "__main__": diff --git a/tensorflow/contrib/distribute/python/warm_starting_util_test.py b/tensorflow/contrib/distribute/python/warm_starting_util_test.py index 5d57d144c1c..b0bcf9b1745 100644 --- a/tensorflow/contrib/distribute/python/warm_starting_util_test.py +++ b/tensorflow/contrib/distribute/python/warm_starting_util_test.py @@ -44,7 +44,9 @@ class WarmStartingUtilWithDistributionStrategyTest( distribution=[combinations.default_strategy, combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus], + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus], save_with_distribution=[True, False], restore_with_distribution=[True, False], mode=["graph"])) diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index 60f6b90edcb..3079175015a 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -72,7 +72,6 @@ py_library( "//tensorflow/python:nn", "//tensorflow/python:nn_ops", "//tensorflow/python:random_ops", - "//tensorflow/python:spectral_ops", "//tensorflow/python:state_ops", "//tensorflow/python:tensor_util", "//tensorflow/python:util", @@ -80,6 +79,7 @@ py_library( "//tensorflow/python:variables", "//tensorflow/python/ops/distributions", "//tensorflow/python/ops/linalg", + "//tensorflow/python/ops/signal", "//third_party/py/numpy", "@six_archive//:six", ], diff --git a/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py b/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py index 29eeaf43c51..ab3c07172a6 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py @@ -82,7 +82,7 @@ class NormalTest(test.TestCase): x = constant_op.constant( [[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], [2.5, -2.5, -4.0, 0.0, 1.0, -2.0]], dtype=dtypes.float32) - s = math_ops.reduce_sum(x, reduction_indices=[1]) + s = math_ops.reduce_sum(x, axis=[1]) x = array_ops.transpose(x) # Reshape to shape (6, 2) n = constant_op.constant([6] * 2) prior = distributions.Normal(loc=mu0, scale=sigma0) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py index a60056c444a..cdee30bbc42 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py @@ -147,14 +147,13 @@ class WishartCholeskyTest(test.TestCase): x = chol_w.sample(10000, seed=42) self.assertAllEqual((10000, 3, 3), x.get_shape()) - 
moment1_estimate = math_ops.reduce_mean(x, reduction_indices=[0]).eval() + moment1_estimate = math_ops.reduce_mean(x, axis=[0]).eval() self.assertAllClose(chol_w.mean().eval(), moment1_estimate, rtol=0.05) # The Variance estimate uses the squares rather than outer-products # because Wishart.Variance is the diagonal of the Wishart covariance # matrix. - variance_estimate = (math_ops.reduce_mean( - math_ops.square(x), reduction_indices=[0]) - + variance_estimate = (math_ops.reduce_mean(math_ops.square(x), axis=[0]) - math_ops.square(moment1_estimate)).eval() self.assertAllClose( chol_w.variance().eval(), variance_estimate, rtol=0.05) diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py index 15c241d5d7a..74765f19e58 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py @@ -168,7 +168,7 @@ class SoftmaxCentered(bijector.Bijector): # log_normalization = 1 + reduce_sum(exp(logits)) # -log_normalization + reduce_sum(logits - log_normalization) log_normalization = nn_ops.softplus( - math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True)) + math_ops.reduce_logsumexp(x, axis=-1, keepdims=True)) return array_ops.squeeze( (-log_normalization + math_ops.reduce_sum( x - log_normalization, axis=-1, keepdims=True)), axis=-1) diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py index aa680a92be6..978e627d663 100644 --- a/tensorflow/contrib/distributions/python/ops/sample_stats.py +++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py @@ -29,8 +29,8 @@ from tensorflow.python.ops import clip_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops -from tensorflow.python.ops import spectral_ops from tensorflow.python.ops.distributions import util +from tensorflow.python.ops.signal import fft_ops __all__ = [ "auto_correlation", @@ -157,11 +157,11 @@ def auto_correlation( dtype.real_dtype.as_numpy_dtype(0.)) # Autocorrelation is IFFT of power-spectral density (up to some scaling). - fft_x_rotated_pad = spectral_ops.fft(x_rotated_pad) + fft_x_rotated_pad = fft_ops.fft(x_rotated_pad) spectral_density = fft_x_rotated_pad * math_ops.conj(fft_x_rotated_pad) # shifted_product is R[m] from above detailed explanation. # It is the inner product sum_n X[n] * Conj(X[n - m]). - shifted_product = spectral_ops.ifft(spectral_density) + shifted_product = fft_ops.ifft(spectral_density) # Cast back to real-valued if x was real to begin with. shifted_product = math_ops.cast(shifted_product, dtype) diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py index 3aed121233b..34614b86a75 100644 --- a/tensorflow/contrib/eager/python/datasets.py +++ b/tensorflow/contrib/eager/python/datasets.py @@ -52,12 +52,6 @@ class Iterator(iterator_ops.EagerIterator): TypeError: If `dataset` is an unsupported type. RuntimeError: When invoked without eager execution enabled. """ - if isinstance(dataset, prefetching_ops._PrefetchToDeviceDataset): # pylint: disable=protected-access - raise TypeError( - "`tf.data.experimental.prefetch_to_device()` is not compatible with " - "`tf.contrib.eager.Iterator`. Use `for ... 
in dataset:` to iterate " - "over the dataset instead.") - if not context.context().device_spec.device_type: is_remote_device = False else: diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py index 6a508fc6ba9..257d02057ae 100644 --- a/tensorflow/contrib/eager/python/datasets_test.py +++ b/tensorflow/contrib/eager/python/datasets_test.py @@ -26,7 +26,6 @@ import numpy as np from tensorflow.contrib import lookup from tensorflow.contrib.eager.python import datasets from tensorflow.python.data import Dataset -from tensorflow.python.data.experimental.ops import prefetching_ops from tensorflow.python.data.experimental.ops import threadpool from tensorflow.python.data.experimental.ops import unique from tensorflow.python.eager import test @@ -208,18 +207,6 @@ class IteratorTest(test.TestCase): y = math_ops.add(x, x) self.assertAllEqual([0., 2.], y.numpy()) - def testTensorsExplicitPrefetchToDevice(self): - ds = Dataset.from_tensor_slices([0., 1.]) - ds = ds.apply(prefetching_ops.prefetch_to_device(test.gpu_device_name())) - - with self.assertRaisesRegexp(TypeError, 'prefetch_to_device'): - datasets.Iterator(ds) - - for i, x in enumerate(ds): - with ops.device(test.gpu_device_name()): - x = math_ops.add(x, x) - self.assertEqual(float(i) + float(i), x.numpy()) - def testOverrideThreadPool(self): def get_thread_id(_): diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py index 7949a3f6da2..51443d24829 100644 --- a/tensorflow/contrib/eager/python/evaluator.py +++ b/tensorflow/contrib/eager/python/evaluator.py @@ -22,6 +22,7 @@ import six from tensorflow.contrib.eager.python import datasets from tensorflow.contrib.eager.python import metrics +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context from tensorflow.python.eager import function from tensorflow.python.framework import errors_impl @@ -164,8 +165,8 @@ class Evaluator(object): self.__call__(example, *args, **kwargs) return self.all_metric_results(summary_logdir) # Graph construction - call_op = self.__call__(dataset.make_one_shot_iterator().get_next(), *args, - **kwargs) + call_op = self.__call__( + dataset_ops.make_one_shot_iterator(dataset).get_next(), *args, **kwargs) init_op = self.init_variables() results_op = self.all_metric_results(summary_logdir) return (init_op, call_op, results_op) diff --git a/tensorflow/contrib/eager/python/examples/densenet/BUILD b/tensorflow/contrib/eager/python/examples/densenet/BUILD index 2dc196f550a..e2154fcc5fc 100644 --- a/tensorflow/contrib/eager/python/examples/densenet/BUILD +++ b/tensorflow/contrib/eager/python/examples/densenet/BUILD @@ -3,6 +3,7 @@ licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//tensorflow:internal"]) load("//tensorflow:tensorflow.bzl", "cuda_py_test") +load("//tensorflow:tensorflow.bzl", "py_binary") py_binary( name = "densenet", diff --git a/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py b/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py index 4b3cb624bc9..24f6b007b52 100644 --- a/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py +++ b/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py @@ -119,7 +119,8 @@ class DensenetBenchmark(tf.test.Benchmark): with tf.Graph().as_default(): np_images, np_labels = random_batch(batch_size) dataset = tf.data.Dataset.from_tensors((np_images, np_labels)).repeat() - (images, labels) = 
dataset.make_one_shot_iterator().get_next() + (images, labels) = tf.compat.v1.data.make_one_shot_iterator( + dataset).get_next() model = densenet.DenseNet(self.depth, self.growth_rate, self.num_blocks, self.output_classes, diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py b/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py index 12b39b0cde4..e73841fbf72 100644 --- a/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py +++ b/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py @@ -42,7 +42,8 @@ class MnistGraphGanBenchmark(tf.test.Benchmark): # Generate some random data. images_data = np.random.randn(batch_size, 784).astype(np.float32) dataset = tf.data.Dataset.from_tensors(images_data) - images = dataset.repeat().make_one_shot_iterator().get_next() + images = tf.compat.v1.data.make_one_shot_iterator( + dataset.repeat()).get_next() # Create the models and optimizers generator = mnist.Generator(data_format()) diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb index ca27a85a229..1a08cc0fd06 100644 --- a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb +++ b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb @@ -470,7 +470,7 @@ "\n", " if epoch % 1 == 0:\n", " loss = tfe.metrics.Mean()\n", - " for test_x in test_dataset.make_one_shot_iterator():\n", + " for test_x in test_dataset:\n", " loss(compute_loss(model, test_x))\n", " elbo = -loss.result()\n", " display.clear_output(wait=False)\n", diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb index 3acecd283cd..12c5eff2b4a 100644 --- a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb +++ b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb @@ -1,1184 +1,1174 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "image_captioning_with_attention.ipynb", - "version": "0.3.2", - "views": {}, - "default_view": {}, - "provenance": [ - { - "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg", - "timestamp": 1530222436922 - } - ], - "private_outputs": true, - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "accelerator": "GPU" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "K2s1A9eLRPEj" + }, + "source": [ + "##### Copyright 2018 The TensorFlow Authors.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\").\n" + ] }, - "cells": [ - { - "metadata": { - "id": "K2s1A9eLRPEj", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "##### Copyright 2018 The TensorFlow Authors.\n", - "\n", - "Licensed under the Apache License, Version 2.0 (the \"License\").\n" - ] + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Cffg2i257iMS" + }, + "source": [ + "# Image Captioning with Attention\n", + "\n", + "
\n", + "\n", + " Run in Google Colab \n", + "\n", + "View source on GitHub
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QASbY_HGo4Lq" + }, + "source": [ + "Image captioning is the task of generating a caption for an image. Given an image like this:\n", + "\n", + "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n", + "\n", + "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n", + "\n", + "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n", + "\n", + "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n", + "\n", + "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n", + "\n", + "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n", + "\n", + "This notebook is an end-to-end example. If you run it, it will download the [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n", + "\n", + "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n", + "\n", + "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "Cffg2i257iMS", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "# Image Captioning with Attention\n", - "\n", - "
\n", - "\n", - " Run in Google Colab \n", - "\n", - "View source on GitHub
" - ] + "colab_type": "code", + "id": "U8l4RJ0XRPEm" + }, + "outputs": [], + "source": [ + "# Import TensorFlow and enable eager execution\n", + "# This code requires TensorFlow version >=1.9\n", + "import tensorflow as tf\n", + "tf.enable_eager_execution()\n", + "\n", + "# We'll generate plots of attention in order to see which parts of an image\n", + "# our model focuses on during captioning\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Scikit-learn includes many helpful utilities\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.utils import shuffle\n", + "\n", + "import re\n", + "import numpy as np\n", + "import os\n", + "import time\n", + "import json\n", + "from glob import glob\n", + "from PIL import Image\n", + "import pickle" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "b6qbGw8MRPE5" + }, + "source": [ + "## Download and prepare the MS-COCO dataset\n", + "\n", + "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically. \n", + "\n", + "**Caution: large download ahead**. We'll use the training set, it's a 13GB file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "QASbY_HGo4Lq", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "Image captioning is the task of generating a caption for an image. Given an image like this:\n", - "\n", - "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n", - "\n", - "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n", - "\n", - "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n", - "\n", - "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n", - "\n", - "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n", - "\n", - "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n", - "\n", - "This notebook is an end-to-end example. If you run it, it will download the [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n", - "\n", - "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n", - "\n", - "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. 
We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n" - ] + "colab_type": "code", + "id": "krQuPYTtRPE7" + }, + "outputs": [], + "source": [ + "annotation_zip = tf.keras.utils.get_file('captions.zip', \n", + " cache_subdir=os.path.abspath('.'),\n", + " origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n", + " extract = True)\n", + "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n", + "\n", + "name_of_zip = 'train2014.zip'\n", + "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n", + " image_zip = tf.keras.utils.get_file(name_of_zip, \n", + " cache_subdir=os.path.abspath('.'),\n", + " origin = 'http://images.cocodataset.org/zips/train2014.zip',\n", + " extract = True)\n", + " PATH = os.path.dirname(image_zip)+'/train2014/'\n", + "else:\n", + " PATH = os.path.abspath('.')+'/train2014/'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "aANEzb5WwSzg" + }, + "source": [ + "## Optionally, limit the size of the training set for faster training\n", + "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "U8l4RJ0XRPEm", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# Import TensorFlow and enable eager execution\n", - "# This code requires TensorFlow version >=1.9\n", - "import tensorflow as tf\n", - "tf.enable_eager_execution()\n", - "\n", - "# We'll generate plots of attention in order to see which parts of an image\n", - "# our model focuses on during captioning\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Scikit-learn includes many helpful utilities\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.utils import shuffle\n", - "\n", - "import re\n", - "import numpy as np\n", - "import os\n", - "import time\n", - "import json\n", - "from glob import glob\n", - "from PIL import Image\n", - "import pickle" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "4G3b8x8_RPFD" + }, + "outputs": [], + "source": [ + "# read the json file\n", + "with open(annotation_file, 'r') as f:\n", + " annotations = json.load(f)\n", + "\n", + "# storing the captions and the image name in vectors\n", + "all_captions = []\n", + "all_img_name_vector = []\n", + "\n", + "for annot in annotations['annotations']:\n", + " caption = ' ' + annot['caption'] + ' '\n", + " image_id = annot['image_id']\n", + " full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n", + " \n", + " all_img_name_vector.append(full_coco_image_path)\n", + " all_captions.append(caption)\n", + "\n", + "# shuffling the captions and image_names together\n", + "# setting a random state\n", + "train_captions, img_name_vector = shuffle(all_captions,\n", + " all_img_name_vector,\n", + " random_state=1)\n", + "\n", + "# selecting the first 30000 captions from the shuffled set\n", + "num_examples = 30000\n", + "train_captions = train_captions[:num_examples]\n", + "img_name_vector = img_name_vector[:num_examples]" + ] + }, + 
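One note on the cell above: the captions and image paths are shuffled together so that each caption stays paired with its image file. A tiny standalone illustration of that behaviour with toy data (not from the notebook):

```python
from sklearn.utils import shuffle

captions = ['a cat on a mat', 'a dog in a park', 'a bird on a wire']
image_paths = ['cat.jpg', 'dog.jpg', 'bird.jpg']

# shuffle() permutes both lists with the same permutation, and the fixed
# random_state makes the selection of the 30,000-caption subset reproducible.
shuffled_captions, shuffled_paths = shuffle(captions, image_paths, random_state=1)

for caption, path in zip(shuffled_captions, shuffled_paths):
    print(caption, '->', path)  # every caption still points at its own image
```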
{ + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "b6qbGw8MRPE5", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Download and prepare the MS-COCO dataset\n", - "\n", - "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically. \n", - "\n", - "**Caution: large download ahead**. We'll use the training set, it's a 13GB file." - ] + "colab_type": "code", + "id": "mPBMgK34RPFL" + }, + "outputs": [], + "source": [ + "len(train_captions), len(all_captions)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8cSW4u-ORPFQ" + }, + "source": [ + "## Preprocess the images using InceptionV3\n", + "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n", + "\n", + "First, we will need to convert the images into the format inceptionV3 expects by:\n", + "* Resizing the image to (299, 299)\n", + "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "krQuPYTtRPE7", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "annotation_zip = tf.keras.utils.get_file('captions.zip', \n", - " cache_subdir=os.path.abspath('.'),\n", - " origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n", - " extract = True)\n", - "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n", - "\n", - "name_of_zip = 'train2014.zip'\n", - "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n", - " image_zip = tf.keras.utils.get_file(name_of_zip, \n", - " cache_subdir=os.path.abspath('.'),\n", - " origin = 'http://images.cocodataset.org/zips/train2014.zip',\n", - " extract = True)\n", - " PATH = os.path.dirname(image_zip)+'/train2014/'\n", - "else:\n", - " PATH = os.path.abspath('.')+'/train2014/'" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "zXR0217aRPFR" + }, + "outputs": [], + "source": [ + "def load_image(image_path):\n", + " img = tf.read_file(image_path)\n", + " img = tf.image.decode_jpeg(img, channels=3)\n", + " img = tf.image.resize_images(img, (299, 299))\n", + " img = tf.keras.applications.inception_v3.preprocess_input(img)\n", + " return img, image_path" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "MDvIu4sXRPFV" + }, + "source": [ + "## Initialize InceptionV3 and load the pretrained Imagenet weights\n", + "\n", + "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n", + "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). 
\n", + "* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```. \n", + "* We avoid doing this during training so it does not become a bottleneck. \n", + "* After all the images are passed through the network, we pickle the dictionary and save it to disk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "aANEzb5WwSzg", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Optionally, limit the size of the training set for faster training\n", - "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data." - ] + "colab_type": "code", + "id": "RD3vW4SsRPFW" + }, + "outputs": [], + "source": [ + "image_model = tf.keras.applications.InceptionV3(include_top=False, \n", + " weights='imagenet')\n", + "new_input = image_model.input\n", + "hidden_layer = image_model.layers[-1].output\n", + "\n", + "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "rERqlR3WRPGO" + }, + "source": [ + "## Caching the features extracted from InceptionV3\n", + "\n", + "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n", + "\n", + "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n", + "\n", + "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n", + "\n", + "```for img, path in image_dataset:``` \n", + "\n", + "to:\n", + "\n", + "```for img, path in tqdm(image_dataset):```." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "4G3b8x8_RPFD", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# read the json file\n", - "with open(annotation_file, 'r') as f:\n", - " annotations = json.load(f)\n", - "\n", - "# storing the captions and the image name in vectors\n", - "all_captions = []\n", - "all_img_name_vector = []\n", - "\n", - "for annot in annotations['annotations']:\n", - " caption = ' ' + annot['caption'] + ' '\n", - " image_id = annot['image_id']\n", - " full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n", - " \n", - " all_img_name_vector.append(full_coco_image_path)\n", - " all_captions.append(caption)\n", - "\n", - "# shuffling the captions and image_names together\n", - "# setting a random state\n", - "train_captions, img_name_vector = shuffle(all_captions,\n", - " all_img_name_vector,\n", - " random_state=1)\n", - "\n", - "# selecting the first 30000 captions from the shuffled set\n", - "num_examples = 30000\n", - "train_captions = train_captions[:num_examples]\n", - "img_name_vector = img_name_vector[:num_examples]" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "Dx_fvbVgRPGQ" + }, + "outputs": [], + "source": [ + "# getting the unique images\n", + "encode_train = sorted(set(img_name_vector))\n", + "\n", + "# feel free to change the batch_size according to your system configuration\n", + "image_dataset = tf.data.Dataset.from_tensor_slices(\n", + " encode_train).map(load_image).batch(16)\n", + "\n", + "for img, path in image_dataset:\n", + " batch_features = image_features_extract_model(img)\n", + " batch_features = tf.reshape(batch_features, \n", + " (batch_features.shape[0], -1, batch_features.shape[3]))\n", + "\n", + " for bf, p in zip(batch_features, path):\n", + " path_of_feature = p.numpy().decode(\"utf-8\")\n", + " np.save(path_of_feature, bf.numpy())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nyqH3zFwRPFi" + }, + "source": [ + "## Preprocess and tokenize the captions\n", + "\n", + "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n", + "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n", + "* Finally, we create a word --> index mapping and vice-versa.\n", + "* We will then pad all sequences to the be same length as the longest one. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "mPBMgK34RPFL", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "len(train_captions), len(all_captions)" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "HZfK8RhQRPFj" + }, + "outputs": [], + "source": [ + "# This will find the maximum length of any caption in our dataset\n", + "def calc_max_length(tensor):\n", + " return max(len(t) for t in tensor)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "8cSW4u-ORPFQ", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Preprocess the images using InceptionV3\n", - "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n", - "\n", - "First, we will need to convert the images into the format inceptionV3 expects by:\n", - "* Resizing the image to (299, 299)\n", - "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)." - ] + "colab_type": "code", + "id": "oJGE34aiRPFo" + }, + "outputs": [], + "source": [ + "# The steps above is a general process of dealing with text processing\n", + "\n", + "# choosing the top 5000 words from the vocabulary\n", + "top_k = 5000\n", + "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n", + " oov_token=\"\", \n", + " filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n", + "tokenizer.fit_on_texts(train_captions)\n", + "train_seqs = tokenizer.texts_to_sequences(train_captions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "zXR0217aRPFR", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "def load_image(image_path):\n", - " img = tf.read_file(image_path)\n", - " img = tf.image.decode_jpeg(img, channels=3)\n", - " img = tf.image.resize_images(img, (299, 299))\n", - " img = tf.keras.applications.inception_v3.preprocess_input(img)\n", - " return img, image_path" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "8Q44tNQVRPFt" + }, + "outputs": [], + "source": [ + "tokenizer.word_index[''] = 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "MDvIu4sXRPFV", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Initialize InceptionV3 and load the pretrained Imagenet weights\n", - "\n", - "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n", - "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). \n", - "* We use the last convolutional layer because we are using attention in this example. 
The shape of the output of this layer is ```8x8x2048```. \n", - "* We avoid doing this during training so it does not become a bottleneck. \n", - "* After all the images are passed through the network, we pickle the dictionary and save it to disk." - ] + "colab_type": "code", + "id": "0fpJb5ojRPFv" + }, + "outputs": [], + "source": [ + "# creating the tokenized vectors\n", + "train_seqs = tokenizer.texts_to_sequences(train_captions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "RD3vW4SsRPFW", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "image_model = tf.keras.applications.InceptionV3(include_top=False, \n", - " weights='imagenet')\n", - "new_input = image_model.input\n", - "hidden_layer = image_model.layers[-1].output\n", - "\n", - "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "AidglIZVRPF4" + }, + "outputs": [], + "source": [ + "# padding each vector to the max_length of the captions\n", + "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n", + "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "rERqlR3WRPGO", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Caching the features extracted from InceptionV3\n", - "\n", - "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n", - "\n", - "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n", - "\n", - "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n", - "\n", - "```for img, path in image_dataset:``` \n", - "\n", - "to:\n", - "\n", - "```for img, path in tqdm(image_dataset):```." 
- ] + "colab_type": "code", + "id": "gL0wkttkRPGA" + }, + "outputs": [], + "source": [ + "# calculating the max_length \n", + "# used to store the attention weights\n", + "max_length = calc_max_length(train_seqs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "M3CD75nDpvTI" + }, + "source": [ + "## Split the data into training and testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "Dx_fvbVgRPGQ", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# getting the unique images\n", - "encode_train = sorted(set(img_name_vector))\n", - "\n", - "# feel free to change the batch_size according to your system configuration\n", - "image_dataset = tf.data.Dataset.from_tensor_slices(\n", - " encode_train).map(load_image).batch(16)\n", - "\n", - "for img, path in image_dataset:\n", - " batch_features = image_features_extract_model(img)\n", - " batch_features = tf.reshape(batch_features, \n", - " (batch_features.shape[0], -1, batch_features.shape[3]))\n", - "\n", - " for bf, p in zip(batch_features, path):\n", - " path_of_feature = p.numpy().decode(\"utf-8\")\n", - " np.save(path_of_feature, bf.numpy())" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "iS7DDMszRPGF" + }, + "outputs": [], + "source": [ + "# Create training and validation sets using 80-20 split\n", + "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n", + " cap_vector, \n", + " test_size=0.2, \n", + " random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "nyqH3zFwRPFi", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Preprocess and tokenize the captions\n", - "\n", - "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n", - "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n", - "* Finally, we create a word --> index mapping and vice-versa.\n", - "* We will then pad all sequences to the be same length as the longest one. " - ] + "colab_type": "code", + "id": "XmViPkRFRPGH" + }, + "outputs": [], + "source": [ + "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "uEWM9xrYcg45" + }, + "source": [ + "## Our images and captions are ready! 
Next, let's create a tf.data dataset to use for training our model.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "HZfK8RhQRPFj", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# This will find the maximum length of any caption in our dataset\n", - "def calc_max_length(tensor):\n", - " return max(len(t) for t in tensor)" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "Q3TnZ1ToRPGV" + }, + "outputs": [], + "source": [ + "# feel free to change these parameters according to your system's configuration\n", + "\n", + "BATCH_SIZE = 64\n", + "BUFFER_SIZE = 1000\n", + "embedding_dim = 256\n", + "units = 512\n", + "vocab_size = len(tokenizer.word_index)\n", + "# shape of the vector extracted from InceptionV3 is (64, 2048)\n", + "# these two variables represent that\n", + "features_shape = 2048\n", + "attention_features_shape = 64" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "oJGE34aiRPFo", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# The steps above is a general process of dealing with text processing\n", - "\n", - "# choosing the top 5000 words from the vocabulary\n", - "top_k = 5000\n", - "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n", - " oov_token=\"\", \n", - " filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n", - "tokenizer.fit_on_texts(train_captions)\n", - "train_seqs = tokenizer.texts_to_sequences(train_captions)" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "SmZS2N0bXG3T" + }, + "outputs": [], + "source": [ + "# loading the numpy files \n", + "def map_func(img_name, cap):\n", + " img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n", + " return img_tensor, cap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "8Q44tNQVRPFt", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "tokenizer.word_index = {key:value for key, value in tokenizer.word_index.items() if value <= top_k}\n", - "# putting token in the word2idx dictionary\n", - "tokenizer.word_index[tokenizer.oov_token] = top_k + 1\n", - "tokenizer.word_index[''] = 0" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "FDF_Nm3tRPGZ" + }, + "outputs": [], + "source": [ + "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n", + "\n", + "# using map to load the numpy files in parallel\n", + "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n", + "# https://www.tensorflow.org/api_docs/python/tf/py_func\n", + "dataset = dataset.map(lambda item1, item2: tf.py_func(\n", + " map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n", + "\n", + "# shuffling and batching\n", + "dataset = dataset.shuffle(BUFFER_SIZE)\n", + "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n", + "dataset = dataset.batch(BATCH_SIZE)\n", + 
"dataset = dataset.prefetch(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nrvoDphgRPGd" + }, + "source": [ + "## Model\n", + "\n", + "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n", + "\n", + "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n", + "\n", + "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n", + "* We squash that to a shape of (64, 2048).\n", + "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n", + "* The RNN(here GRU) attends over the image to predict the next word." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "0fpJb5ojRPFv", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# creating the tokenized vectors\n", - "train_seqs = tokenizer.texts_to_sequences(train_captions)" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "AAppCGLKRPGd" + }, + "outputs": [], + "source": [ + "def gru(units):\n", + " # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n", + " # significant speedup).\n", + " if tf.test.is_gpu_available():\n", + " return tf.keras.layers.CuDNNGRU(units, \n", + " return_sequences=True, \n", + " return_state=True, \n", + " recurrent_initializer='glorot_uniform')\n", + " else:\n", + " return tf.keras.layers.GRU(units, \n", + " return_sequences=True, \n", + " return_state=True, \n", + " recurrent_activation='sigmoid', \n", + " recurrent_initializer='glorot_uniform')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "olQArbgbRPF1", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# creating a reverse mapping (index -> word)\n", - "index_word = {value:key for key, value in tokenizer.word_index.items()}" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "ja2LFTMSdeV3" + }, + "outputs": [], + "source": [ + "class BahdanauAttention(tf.keras.Model):\n", + " def __init__(self, units):\n", + " super(BahdanauAttention, self).__init__()\n", + " self.W1 = tf.keras.layers.Dense(units)\n", + " self.W2 = tf.keras.layers.Dense(units)\n", + " self.V = tf.keras.layers.Dense(1)\n", + " \n", + " def call(self, features, hidden):\n", + " # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n", + " \n", + " # hidden shape == (batch_size, hidden_size)\n", + " # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n", + " hidden_with_time_axis = tf.expand_dims(hidden, 1)\n", + " \n", + " # score shape == (batch_size, 64, hidden_size)\n", + " score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n", + " \n", + " # attention_weights shape == (batch_size, 64, 1)\n", + " # we get 1 at the last axis because we are applying score to self.V\n", + " 
attention_weights = tf.nn.softmax(self.V(score), axis=1)\n", + " \n", + " # context_vector shape after sum == (batch_size, hidden_size)\n", + " context_vector = attention_weights * features\n", + " context_vector = tf.reduce_sum(context_vector, axis=1)\n", + " \n", + " return context_vector, attention_weights" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "AidglIZVRPF4", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# padding each vector to the max_length of the captions\n", - "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n", - "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "AZ7R1RxHRPGf" + }, + "outputs": [], + "source": [ + "class CNN_Encoder(tf.keras.Model):\n", + " # Since we have already extracted the features and dumped it using pickle\n", + " # This encoder passes those features through a Fully connected layer\n", + " def __init__(self, embedding_dim):\n", + " super(CNN_Encoder, self).__init__()\n", + " # shape after fc == (batch_size, 64, embedding_dim)\n", + " self.fc = tf.keras.layers.Dense(embedding_dim)\n", + " \n", + " def call(self, x):\n", + " x = self.fc(x)\n", + " x = tf.nn.relu(x)\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "gL0wkttkRPGA", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# calculating the max_length \n", - "# used to store the attention weights\n", - "max_length = calc_max_length(train_seqs)" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "V9UbGQmERPGi" + }, + "outputs": [], + "source": [ + "class RNN_Decoder(tf.keras.Model):\n", + " def __init__(self, embedding_dim, units, vocab_size):\n", + " super(RNN_Decoder, self).__init__()\n", + " self.units = units\n", + "\n", + " self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n", + " self.gru = gru(self.units)\n", + " self.fc1 = tf.keras.layers.Dense(self.units)\n", + " self.fc2 = tf.keras.layers.Dense(vocab_size)\n", + " \n", + " self.attention = BahdanauAttention(self.units)\n", + " \n", + " def call(self, x, features, hidden):\n", + " # defining attention as a separate model\n", + " context_vector, attention_weights = self.attention(features, hidden)\n", + " \n", + " # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n", + " x = self.embedding(x)\n", + " \n", + " # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n", + " x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n", + " \n", + " # passing the concatenated vector to the GRU\n", + " output, state = self.gru(x)\n", + " \n", + " # shape == (batch_size, max_length, hidden_size)\n", + " x = self.fc1(output)\n", + " \n", + " # x shape == (batch_size * max_length, hidden_size)\n", + " x = tf.reshape(x, (-1, x.shape[2]))\n", + " \n", + " # output shape == (batch_size * max_length, vocab)\n", + " x = self.fc2(x)\n", + "\n", + " return x, state, attention_weights\n", + "\n", + " def reset_state(self, 
batch_size):\n", + " return tf.zeros((batch_size, self.units))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "M3CD75nDpvTI", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Split the data into training and testing" - ] + "colab_type": "code", + "id": "Qs_Sr03wRPGk" + }, + "outputs": [], + "source": [ + "encoder = CNN_Encoder(embedding_dim)\n", + "decoder = RNN_Decoder(embedding_dim, units, vocab_size)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "iS7DDMszRPGF", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# Create training and validation sets using 80-20 split\n", - "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n", - " cap_vector, \n", - " test_size=0.2, \n", - " random_state=0)" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "-bYN7xA0RPGl" + }, + "outputs": [], + "source": [ + "optimizer = tf.train.AdamOptimizer()\n", + "\n", + "# We are masking the loss calculated for padding\n", + "def loss_function(real, pred):\n", + " mask = 1 - np.equal(real, 0)\n", + " loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n", + " return tf.reduce_mean(loss_)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PHod7t72RPGn" + }, + "source": [ + "## Training\n", + "\n", + "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n", + "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n", + "* The decoder returns the predictions and the decoder hidden state.\n", + "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n", + "* Use teacher forcing to decide the next input to the decoder.\n", + "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n", + "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "XmViPkRFRPGH", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "Vt4WZ5mhJE-E" + }, + "outputs": [], + "source": [ + "# adding this in a separate cell because if you run the training cell \n", + "# many times, the loss_plot array will be reset\n", + "loss_plot = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "uEWM9xrYcg45", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Our images and captions are ready! 
Next, let's create a tf.data dataset to use for training our model.\n", - "\n" - ] + "colab_type": "code", + "id": "UlA4VIQpRPGo" + }, + "outputs": [], + "source": [ + "EPOCHS = 20\n", + "\n", + "for epoch in range(EPOCHS):\n", + " start = time.time()\n", + " total_loss = 0\n", + " \n", + " for (batch, (img_tensor, target)) in enumerate(dataset):\n", + " loss = 0\n", + " \n", + " # initializing the hidden state for each batch\n", + " # because the captions are not related from image to image\n", + " hidden = decoder.reset_state(batch_size=target.shape[0])\n", + "\n", + " dec_input = tf.expand_dims([tokenizer.word_index['']] * BATCH_SIZE, 1)\n", + " \n", + " with tf.GradientTape() as tape:\n", + " features = encoder(img_tensor)\n", + " \n", + " for i in range(1, target.shape[1]):\n", + " # passing the features through the decoder\n", + " predictions, hidden, _ = decoder(dec_input, features, hidden)\n", + "\n", + " loss += loss_function(target[:, i], predictions)\n", + " \n", + " # using teacher forcing\n", + " dec_input = tf.expand_dims(target[:, i], 1)\n", + " \n", + " total_loss += (loss / int(target.shape[1]))\n", + " \n", + " variables = encoder.variables + decoder.variables\n", + " \n", + " gradients = tape.gradient(loss, variables) \n", + " \n", + " optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n", + " \n", + " if batch % 100 == 0:\n", + " print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n", + " batch, \n", + " loss.numpy() / int(target.shape[1])))\n", + " # storing the epoch end loss value to plot later\n", + " loss_plot.append(total_loss / len(cap_vector))\n", + " \n", + " print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n", + " total_loss/len(cap_vector)))\n", + " print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "Q3TnZ1ToRPGV", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# feel free to change these parameters according to your system's configuration\n", - "\n", - "BATCH_SIZE = 64\n", - "BUFFER_SIZE = 1000\n", - "embedding_dim = 256\n", - "units = 512\n", - "vocab_size = len(tokenizer.word_index)\n", - "# shape of the vector extracted from InceptionV3 is (64, 2048)\n", - "# these two variables represent that\n", - "features_shape = 2048\n", - "attention_features_shape = 64" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "1Wm83G-ZBPcC" + }, + "outputs": [], + "source": [ + "plt.plot(loss_plot)\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.title('Loss Plot')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "xGvOcLQKghXN" + }, + "source": [ + "## Caption!\n", + "\n", + "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n", + "* Stop predicting when the model predicts the end token.\n", + "* And store the attention weights for every time step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "SmZS2N0bXG3T", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# loading the numpy files \n", - "def map_func(img_name, cap):\n", - " img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n", - " return img_tensor, cap" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "RCWpDtyNRPGs" + }, + "outputs": [], + "source": [ + "def evaluate(image):\n", + " attention_plot = np.zeros((max_length, attention_features_shape))\n", + "\n", + " hidden = decoder.reset_state(batch_size=1)\n", + "\n", + " temp_input = tf.expand_dims(load_image(image)[0], 0)\n", + " img_tensor_val = image_features_extract_model(temp_input)\n", + " img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n", + "\n", + " features = encoder(img_tensor_val)\n", + "\n", + " dec_input = tf.expand_dims([tokenizer.word_index['']], 0)\n", + " result = []\n", + "\n", + " for i in range(max_length):\n", + " predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n", + "\n", + " attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n", + "\n", + " predicted_id = tf.argmax(predictions[0]).numpy()\n", + " result.append(tokenizer.index_word[predicted_id])\n", + "\n", + " if tokenizer.index_word[predicted_id] == '':\n", + " return result, attention_plot\n", + "\n", + " dec_input = tf.expand_dims([predicted_id], 0)\n", + "\n", + " attention_plot = attention_plot[:len(result), :]\n", + " return result, attention_plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "FDF_Nm3tRPGZ", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n", - "\n", - "# using map to load the numpy files in parallel\n", - "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n", - "# https://www.tensorflow.org/api_docs/python/tf/py_func\n", - "dataset = dataset.map(lambda item1, item2: tf.py_func(\n", - " map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n", - "\n", - "# shuffling and batching\n", - "dataset = dataset.shuffle(BUFFER_SIZE)\n", - "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n", - "dataset = dataset.batch(BATCH_SIZE)\n", - "dataset = dataset.prefetch(1)" - ], - "execution_count": 0, - "outputs": [] + "colab_type": "code", + "id": "fD_y7PD6RPGt" + }, + "outputs": [], + "source": [ + "def plot_attention(image, result, attention_plot):\n", + " temp_image = np.array(Image.open(image))\n", + "\n", + " fig = plt.figure(figsize=(10, 10))\n", + " \n", + " len_result = len(result)\n", + " for l in range(len_result):\n", + " temp_att = np.resize(attention_plot[l], (8, 8))\n", + " ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n", + " ax.set_title(result[l])\n", + " img = ax.imshow(temp_image)\n", + " ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n", + "\n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, - { - "metadata": { - "id": "nrvoDphgRPGd", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Model\n", - "\n", - "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n", - "\n", - "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n", - "\n", - "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n", - "* We squash that to a shape of (64, 2048).\n", - "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n", - "* The RNN(here GRU) attends over the image to predict the next word." - ] + "colab_type": "code", + "id": "io7ws3ReRPGv" + }, + "outputs": [], + "source": [ + "# captions on the validation set\n", + "rid = np.random.randint(0, len(img_name_val))\n", + "image = img_name_val[rid]\n", + "real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])\n", + "result, attention_plot = evaluate(image)\n", + "\n", + "print ('Real Caption:', real_caption)\n", + "print ('Prediction Caption:', ' '.join(result))\n", + "plot_attention(image, result, attention_plot)\n", + "# opening the image\n", + "Image.open(img_name_val[rid])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Rprk3HEvZuxb" + }, + "source": [ + "## Try it on your own images\n", + "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } }, + "colab_type": "code", + "id": "9Psd1quzaAWg" + }, + "outputs": [], + "source": [ + "image_url = 'https://tensorflow.org/images/surf.jpg'\n", + "image_extension = image_url[-4:]\n", + "image_path = tf.keras.utils.get_file('image'+image_extension, \n", + " origin=image_url)\n", + "\n", + "result, attention_plot = evaluate(image_path)\n", + "print ('Prediction Caption:', ' '.join(result))\n", + "plot_attention(image_path, result, attention_plot)\n", + "# opening the image\n", + "Image.open(image_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "VJZXyJco6uLO" + }, + "source": [ + "# Next steps\n", + "\n", + "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset." 
+ ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "default_view": {}, + "name": "image_captioning_with_attention.ipynb", + "private_outputs": true, + "provenance": [ { - "metadata": { - "id": "AAppCGLKRPGd", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "def gru(units):\n", - " # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n", - " # significant speedup).\n", - " if tf.test.is_gpu_available():\n", - " return tf.keras.layers.CuDNNGRU(units, \n", - " return_sequences=True, \n", - " return_state=True, \n", - " recurrent_initializer='glorot_uniform')\n", - " else:\n", - " return tf.keras.layers.GRU(units, \n", - " return_sequences=True, \n", - " return_state=True, \n", - " recurrent_activation='sigmoid', \n", - " recurrent_initializer='glorot_uniform')" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "ja2LFTMSdeV3", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "class BahdanauAttention(tf.keras.Model):\n", - " def __init__(self, units):\n", - " super(BahdanauAttention, self).__init__()\n", - " self.W1 = tf.keras.layers.Dense(units)\n", - " self.W2 = tf.keras.layers.Dense(units)\n", - " self.V = tf.keras.layers.Dense(1)\n", - " \n", - " def call(self, features, hidden):\n", - " # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n", - " \n", - " # hidden shape == (batch_size, hidden_size)\n", - " # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n", - " hidden_with_time_axis = tf.expand_dims(hidden, 1)\n", - " \n", - " # score shape == (batch_size, 64, hidden_size)\n", - " score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n", - " \n", - " # attention_weights shape == (batch_size, 64, 1)\n", - " # we get 1 at the last axis because we are applying score to self.V\n", - " attention_weights = tf.nn.softmax(self.V(score), axis=1)\n", - " \n", - " # context_vector shape after sum == (batch_size, hidden_size)\n", - " context_vector = attention_weights * features\n", - " context_vector = tf.reduce_sum(context_vector, axis=1)\n", - " \n", - " return context_vector, attention_weights" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "AZ7R1RxHRPGf", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "class CNN_Encoder(tf.keras.Model):\n", - " # Since we have already extracted the features and dumped it using pickle\n", - " # This encoder passes those features through a Fully connected layer\n", - " def __init__(self, embedding_dim):\n", - " super(CNN_Encoder, self).__init__()\n", - " # shape after fc == (batch_size, 64, embedding_dim)\n", - " self.fc = tf.keras.layers.Dense(embedding_dim)\n", - " \n", - " def call(self, x):\n", - " x = self.fc(x)\n", - " x = tf.nn.relu(x)\n", - " return x" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "V9UbGQmERPGi", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "class RNN_Decoder(tf.keras.Model):\n", - " def __init__(self, embedding_dim, units, vocab_size):\n", - " super(RNN_Decoder, self).__init__()\n", - " self.units = units\n", - "\n", - " self.embedding = 
tf.keras.layers.Embedding(vocab_size, embedding_dim)\n", - " self.gru = gru(self.units)\n", - " self.fc1 = tf.keras.layers.Dense(self.units)\n", - " self.fc2 = tf.keras.layers.Dense(vocab_size)\n", - " \n", - " self.attention = BahdanauAttention(self.units)\n", - " \n", - " def call(self, x, features, hidden):\n", - " # defining attention as a separate model\n", - " context_vector, attention_weights = self.attention(features, hidden)\n", - " \n", - " # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n", - " x = self.embedding(x)\n", - " \n", - " # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n", - " x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n", - " \n", - " # passing the concatenated vector to the GRU\n", - " output, state = self.gru(x)\n", - " \n", - " # shape == (batch_size, max_length, hidden_size)\n", - " x = self.fc1(output)\n", - " \n", - " # x shape == (batch_size * max_length, hidden_size)\n", - " x = tf.reshape(x, (-1, x.shape[2]))\n", - " \n", - " # output shape == (batch_size * max_length, vocab)\n", - " x = self.fc2(x)\n", - "\n", - " return x, state, attention_weights\n", - "\n", - " def reset_state(self, batch_size):\n", - " return tf.zeros((batch_size, self.units))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "Qs_Sr03wRPGk", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "encoder = CNN_Encoder(embedding_dim)\n", - "decoder = RNN_Decoder(embedding_dim, units, vocab_size)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "-bYN7xA0RPGl", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "optimizer = tf.train.AdamOptimizer()\n", - "\n", - "# We are masking the loss calculated for padding\n", - "def loss_function(real, pred):\n", - " mask = 1 - np.equal(real, 0)\n", - " loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n", - " return tf.reduce_mean(loss_)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "PHod7t72RPGn", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Training\n", - "\n", - "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n", - "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n", - "* The decoder returns the predictions and the decoder hidden state.\n", - "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n", - "* Use teacher forcing to decide the next input to the decoder.\n", - "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n", - "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n" - ] - }, - { - "metadata": { - "id": "Vt4WZ5mhJE-E", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# adding this in a separate cell because if you run the training cell \n", - "# many times, the loss_plot array will be reset\n", - "loss_plot = []" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "UlA4VIQpRPGo", - "colab_type": 
"code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "EPOCHS = 20\n", - "\n", - "for epoch in range(EPOCHS):\n", - " start = time.time()\n", - " total_loss = 0\n", - " \n", - " for (batch, (img_tensor, target)) in enumerate(dataset):\n", - " loss = 0\n", - " \n", - " # initializing the hidden state for each batch\n", - " # because the captions are not related from image to image\n", - " hidden = decoder.reset_state(batch_size=target.shape[0])\n", - "\n", - " dec_input = tf.expand_dims([tokenizer.word_index['']] * BATCH_SIZE, 1)\n", - " \n", - " with tf.GradientTape() as tape:\n", - " features = encoder(img_tensor)\n", - " \n", - " for i in range(1, target.shape[1]):\n", - " # passing the features through the decoder\n", - " predictions, hidden, _ = decoder(dec_input, features, hidden)\n", - "\n", - " loss += loss_function(target[:, i], predictions)\n", - " \n", - " # using teacher forcing\n", - " dec_input = tf.expand_dims(target[:, i], 1)\n", - " \n", - " total_loss += (loss / int(target.shape[1]))\n", - " \n", - " variables = encoder.variables + decoder.variables\n", - " \n", - " gradients = tape.gradient(loss, variables) \n", - " \n", - " optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n", - " \n", - " if batch % 100 == 0:\n", - " print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n", - " batch, \n", - " loss.numpy() / int(target.shape[1])))\n", - " # storing the epoch end loss value to plot later\n", - " loss_plot.append(total_loss / len(cap_vector))\n", - " \n", - " print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n", - " total_loss/len(cap_vector)))\n", - " print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "1Wm83G-ZBPcC", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "plt.plot(loss_plot)\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.title('Loss Plot')\n", - "plt.show()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "xGvOcLQKghXN", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Caption!\n", - "\n", - "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n", - "* Stop predicting when the model predicts the end token.\n", - "* And store the attention weights for every time step." 
- ] - }, - { - "metadata": { - "id": "RCWpDtyNRPGs", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "def evaluate(image):\n", - " attention_plot = np.zeros((max_length, attention_features_shape))\n", - "\n", - " hidden = decoder.reset_state(batch_size=1)\n", - "\n", - " temp_input = tf.expand_dims(load_image(image)[0], 0)\n", - " img_tensor_val = image_features_extract_model(temp_input)\n", - " img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n", - "\n", - " features = encoder(img_tensor_val)\n", - "\n", - " dec_input = tf.expand_dims([tokenizer.word_index['']], 0)\n", - " result = []\n", - "\n", - " for i in range(max_length):\n", - " predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n", - "\n", - " attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n", - "\n", - " predicted_id = tf.argmax(predictions[0]).numpy()\n", - " result.append(index_word[predicted_id])\n", - "\n", - " if index_word[predicted_id] == '':\n", - " return result, attention_plot\n", - "\n", - " dec_input = tf.expand_dims([predicted_id], 0)\n", - "\n", - " attention_plot = attention_plot[:len(result), :]\n", - " return result, attention_plot" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "fD_y7PD6RPGt", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "def plot_attention(image, result, attention_plot):\n", - " temp_image = np.array(Image.open(image))\n", - "\n", - " fig = plt.figure(figsize=(10, 10))\n", - " \n", - " len_result = len(result)\n", - " for l in range(len_result):\n", - " temp_att = np.resize(attention_plot[l], (8, 8))\n", - " ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n", - " ax.set_title(result[l])\n", - " img = ax.imshow(temp_image)\n", - " ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n", - "\n", - " plt.tight_layout()\n", - " plt.show()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "io7ws3ReRPGv", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# captions on the validation set\n", - "rid = np.random.randint(0, len(img_name_val))\n", - "image = img_name_val[rid]\n", - "real_caption = ' '.join([index_word[i] for i in cap_val[rid] if i not in [0]])\n", - "result, attention_plot = evaluate(image)\n", - "\n", - "print ('Real Caption:', real_caption)\n", - "print ('Prediction Caption:', ' '.join(result))\n", - "plot_attention(image, result, attention_plot)\n", - "# opening the image\n", - "Image.open(img_name_val[rid])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "Rprk3HEvZuxb", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Try it on your own images\n", - "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. 
Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n" - ] - }, - { - "metadata": { - "id": "9Psd1quzaAWg", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "image_url = 'https://tensorflow.org/images/surf.jpg'\n", - "image_extension = image_url[-4:]\n", - "image_path = tf.keras.utils.get_file('image'+image_extension, \n", - " origin=image_url)\n", - "\n", - "result, attention_plot = evaluate(image_path)\n", - "print ('Prediction Caption:', ' '.join(result))\n", - "plot_attention(image_path, result, attention_plot)\n", - "# opening the image\n", - "Image.open(image_path)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "VJZXyJco6uLO", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "# Next steps\n", - "\n", - "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset." - ] + "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg", + "timestamp": 1530222436922 } - ] + ], + "toc_visible": true, + "version": "0.3.2", + "views": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py index 557ad427521..d412b25b368 100644 --- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py +++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py @@ -36,7 +36,7 @@ class GraphLinearRegressionBenchmark(tf.test.Benchmark): noise_level=0.01, batch_size=batch_size, num_batches=num_batches) - iterator = dataset.make_initializable_iterator() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) x, y = iterator.get_next() model = linear_regression.LinearModel() diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb index 480777d9487..66d52a74943 100644 --- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb +++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb @@ -768,7 +768,7 @@ }, "outputs": [], "source": [ - "translate('hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + "translate(u'hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" ] }, { @@ -781,7 +781,7 @@ }, "outputs": [], "source": [ - "translate('esta es mi vida.', encoder, decoder, 
inp_lang, targ_lang, max_length_inp, max_length_targ)" + "translate(u'esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" ] }, { @@ -794,7 +794,7 @@ }, "outputs": [], "source": [ - "translate('Āætodavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + "translate(u'todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" ] }, { @@ -808,7 +808,7 @@ "outputs": [], "source": [ "# wrong translation\n", - "translate('trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + "translate(u'trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" ] }, { diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py index f3bb978875e..fb7975d8fe8 100644 --- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py +++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py @@ -142,7 +142,8 @@ class ResNet50Benchmarks(tf.test.Benchmark): with tf.Graph().as_default(): np_images, np_labels = random_batch(batch_size) dataset = tf.data.Dataset.from_tensors((np_images, np_labels)).repeat() - (images, labels) = dataset.make_one_shot_iterator().get_next() + images, labels = tf.compat.v1.data.make_one_shot_iterator( + dataset).get_next() model = resnet50.ResNet50(data_format()) logits = model(images, training=True) diff --git a/tensorflow/contrib/eager/python/examples/revnet/main.py b/tensorflow/contrib/eager/python/examples/revnet/main.py index b702e91f922..9585f3565f8 100644 --- a/tensorflow/contrib/eager/python/examples/revnet/main.py +++ b/tensorflow/contrib/eager/python/examples/revnet/main.py @@ -72,14 +72,11 @@ def main(_): train_one_iter(model, x, y, optimizer, global_step=global_step) if global_step.numpy() % config.log_every == 0: - it_test = ds_test.make_one_shot_iterator() - acc_test, loss_test = evaluate(model, it_test) + acc_test, loss_test = evaluate(model, ds_test) if FLAGS.validate: - it_train = ds_train_one_shot.make_one_shot_iterator() - it_validation = ds_validation.make_one_shot_iterator() - acc_train, loss_train = evaluate(model, it_train) - acc_validation, loss_validation = evaluate(model, it_validation) + acc_train, loss_train = evaluate(model, ds_train_one_shot) + acc_validation, loss_validation = evaluate(model, ds_validation) print("Iter {}, " "training set accuracy {:.4f}, loss {:.4f}; " "validation set accuracy {:.4f}, loss {:.4f}; " @@ -218,11 +215,11 @@ def train_one_iter(model, inputs, labels, optimizer, global_step=None): return logits, loss -def evaluate(model, iterator): +def evaluate(model, dataset): """Compute accuracy with the given dataset iterator.""" mean_loss = tfe.metrics.Mean() accuracy = tfe.metrics.Accuracy() - for x, y in iterator: + for x, y in dataset: logits, _ = model(x, training=False) loss = model.compute_loss(logits=logits, labels=y) accuracy( diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py index 63b5c4c54d1..770484abed9 100644 --- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py +++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py @@ -82,7 +82,7 @@ class PTBBenchmark(tf.test.Benchmark): tf.ones( [PTBBenchmark.SEQ_LEN, PTBBenchmark.BATCH_SIZE], 
dtype=tf.int64)).repeat(num_iters + num_warmup) - inputs = dataset.make_one_shot_iterator().get_next() + inputs = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next() with tf.device(tf.test.gpu_device_name()): outputs = model(inputs, training=True) @@ -124,7 +124,8 @@ class PTBBenchmark(tf.test.Benchmark): dtype=tf.int64)).repeat(num_iters + num_warmup) # inputs and labels have the same shape dataset = tf.data.Dataset.zip((dataset, dataset)) - (inputs, labels) = dataset.make_one_shot_iterator().get_next() + (inputs, labels) = tf.compat.v1.data.make_one_shot_iterator( + dataset).get_next() with tf.device(tf.test.gpu_device_name()): optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0) diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py index c88c0f52eea..566246de495 100644 --- a/tensorflow/contrib/eager/python/metrics_impl.py +++ b/tensorflow/contrib/eager/python/metrics_impl.py @@ -24,6 +24,7 @@ from tensorflow.python.eager import context from tensorflow.python.eager import function from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import smart_cond from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops @@ -354,9 +355,10 @@ class Mean(Metric): def write_summary_f(): summary_ops.scalar(name=self.name, tensor=t) return t - control_flow_ops.cond(write_summary, + smart_cond.smart_cond(write_summary, write_summary_f, - lambda: t) + lambda: t, + name="") return t diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py index 9d2d172752c..39e5957f5d1 100644 --- a/tensorflow/contrib/eager/python/metrics_test.py +++ b/tensorflow/contrib/eager/python/metrics_test.py @@ -49,18 +49,6 @@ class MetricsTest(test.TestCase): self.assertEqual(dtypes.float64, m.dtype) self.assertEqual(dtypes.float64, m.result().dtype) - def testSummaryArg(self): - m = metrics.Mean() - m([1, 10, 100]) - m(1000) - m([10000.0, 100000.0]) - self.assertEqual(111111.0/6, m.result(write_summary=True).numpy()) - self.assertEqual(111111.0/6, m.result(write_summary=False).numpy()) - with self.assertRaises(ValueError): - m.result(write_summary=5) - with self.assertRaises(ValueError): - m.result(write_summary=[True]) - def testVariableCollections(self): with context.graph_mode(), ops.Graph().as_default(): m = metrics.Mean() diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py index f801d9a47b2..5cc0c4f23d9 100644 --- a/tensorflow/contrib/eager/python/network.py +++ b/tensorflow/contrib/eager/python/network.py @@ -24,7 +24,7 @@ import weakref from tensorflow.python.eager import context from tensorflow.python.framework import ops -from tensorflow.python.keras.engine import base_layer as keras_base_layer +from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.layers import base from tensorflow.python.ops import variable_scope from tensorflow.python.platform import tf_logging as logging @@ -220,7 +220,7 @@ class Network(base.Layer): avoid_names = parent_network._owned_layers name_uid_map = parent_network._sub_layer_name_uids else: - name_uid_map = keras_base_layer.get_default_graph_uid_map() + name_uid_map = base_layer_utils.get_default_graph_uid_map() # Figure out which names we have to avoid based on which variable scope # we're nested in. 
strip_name = self._default_parent_variable_scope.name diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py index f9c716360c5..1d0d6c6c14c 100644 --- a/tensorflow/contrib/eager/python/saver.py +++ b/tensorflow/contrib/eager/python/saver.py @@ -115,6 +115,11 @@ def restore_variables_on_create(save_path, map_func=None): class Saver(object): """A tf.train.Saver adapter for use when eager execution is enabled. + + `Saver`'s name-based checkpointing strategy is fragile. Please switch to + `tf.train.Checkpoint` or `tf.keras.Model.save_weights`, which perform a more + robust object-based saving. These APIs will load checkpoints written by + `Saver`. """ def __init__(self, var_list): diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py index 4454abfb966..8c35dddb5a5 100644 --- a/tensorflow/contrib/eager/python/tfe_test.py +++ b/tensorflow/contrib/eager/python/tfe_test.py @@ -87,8 +87,8 @@ class TFETest(test_util.TensorFlowTestCase): x += 1. # Without a device context, heuristics are used to place ops. # In this case, ops.reduce_mean runs on the GPU. - reduction_indices = range(x.shape.ndims) - m = math_ops.reduce_mean(x, reduction_indices) + axis = range(x.shape.ndims) + m = math_ops.reduce_mean(x, axis) # m is on GPU, bring it back to CPU and compare. self.assertEqual(3.5, m.cpu().numpy()) diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD index 37f253d9c11..a888379f13e 100644 --- a/tensorflow/contrib/estimator/BUILD +++ b/tensorflow/contrib/estimator/BUILD @@ -16,7 +16,6 @@ py_library( srcs_version = "PY2AND3", deps = [ ":boosted_trees", - ":dnn", ":dnn_with_layer_annotations", ":early_stopping", ":expect_tensorflow_estimator_installed", @@ -25,7 +24,6 @@ py_library( ":extenders", ":head", ":hooks", - ":linear", ":logit_fns", ":multi_head", ":replicate_model_fn", @@ -47,18 +45,6 @@ py_library( ], ) -py_library( - name = "dnn", - srcs = ["python/estimator/dnn.py"], - srcs_version = "PY2AND3", - deps = [ - ":expect_tensorflow_estimator_installed", - "//tensorflow:tensorflow_py_no_contrib", - "//tensorflow/python/estimator", - "//tensorflow/python/estimator:dnn", - ], -) - py_library( name = "dnn_with_layer_annotations", srcs = ["python/estimator/dnn_with_layer_annotations.py"], @@ -144,17 +130,6 @@ py_library( ], ) -py_library( - name = "linear", - srcs = ["python/estimator/linear.py"], - srcs_version = "PY2AND3", - deps = [ - ":expect_tensorflow_estimator_installed", - "//tensorflow/python/estimator", - "//tensorflow/python/estimator:linear", - ], -) - py_library( name = "logit_fns", srcs = [ diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py index 80d59627620..7d61247e7ef 100644 --- a/tensorflow/contrib/estimator/__init__.py +++ b/tensorflow/contrib/estimator/__init__.py @@ -58,8 +58,6 @@ _allowed_symbols = [ 'multi_label_head', 'poisson_regression_head', 'regression_head', - 'DNNEstimator', - 'LinearEstimator', 'boosted_trees_classifier_train_in_memory', 'boosted_trees_regressor_train_in_memory', 'call_logit_fn', diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py deleted file mode 100644 index 7894418c4a1..00000000000 --- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""dnn_linear_combined python module. - -Importing from tensorflow.python.estimator is unsupported -and will soon break! -""" -# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow_estimator.contrib.estimator.python.estimator import dnn_linear_combined - -# Include attrs that start with single underscore. -_HAS_DYNAMIC_ATTRIBUTES = True -dnn_linear_combined.__all__ = [ - s for s in dir(dnn_linear_combined) if not s.startswith('__') -] - -from tensorflow_estimator.contrib.estimator.python.estimator.dnn_linear_combined import * diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py index f384d761a84..3eb396a29cc 100644 --- a/tensorflow/contrib/factorization/python/ops/kmeans.py +++ b/tensorflow/contrib/factorization/python/ops/kmeans.py @@ -26,7 +26,7 @@ from tensorflow.contrib.factorization.python.ops import clustering_ops from tensorflow.python.estimator import estimator from tensorflow.python.estimator import model_fn as model_fn_lib from tensorflow.python.estimator.export import export_output -from tensorflow.python.feature_column import feature_column as fc +from tensorflow.python.feature_column import feature_column_lib as fc from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py index 1ab5418fe46..2f7cd131d3e 100644 --- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py +++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py @@ -27,7 +27,7 @@ from sklearn.cluster import KMeans as SklearnKMeans # pylint: disable=g-import-not-at-top from tensorflow.contrib.factorization.python.ops import kmeans as kmeans_lib from tensorflow.python.estimator import run_config -from tensorflow.python.feature_column import feature_column as fc +from tensorflow.python.feature_column import feature_column_lib as fc from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD index bbe335be3e1..1cd83bdb5de 100644 --- a/tensorflow/contrib/feature_column/BUILD +++ b/tensorflow/contrib/feature_column/BUILD @@ -14,6 +14,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":sequence_feature_column", + ":sequence_feature_column_v2", "//tensorflow/python:util", ], ) @@ -32,7 +33,7 @@ py_library( "//tensorflow/python:sparse_ops", "//tensorflow/python:tensor_shape", "//tensorflow/python:variable_scope", - "//tensorflow/python/feature_column", + 
"//tensorflow/python/feature_column:feature_column_py", ], ) @@ -51,7 +52,7 @@ py_test( "//tensorflow/python:parsing_ops", "//tensorflow/python:sparse_tensor", "//tensorflow/python:training", - "//tensorflow/python/feature_column", + "//tensorflow/python/feature_column:feature_column_py", "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], @@ -69,7 +70,7 @@ py_test( "//tensorflow/python:parsing_ops", "//tensorflow/python:training", "//tensorflow/python:util", - "//tensorflow/python/feature_column", + "//tensorflow/python/feature_column:feature_column_py", "//tensorflow/python/keras:layers", ], ) @@ -89,7 +90,7 @@ py_library( "//tensorflow/python:tensor_shape", "//tensorflow/python:variable_scope", "//tensorflow/python/feature_column", - "//tensorflow/python/feature_column:feature_column_v2", + "//tensorflow/python/feature_column:feature_column_py", ], ) @@ -110,7 +111,7 @@ py_test( "//tensorflow/python:sparse_tensor", "//tensorflow/python:training", "//tensorflow/python/feature_column", - "//tensorflow/python/feature_column:feature_column_v2", + "//tensorflow/python/feature_column:feature_column_py", "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index dd6da35ed00..9b3a5c58aaa 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -222,10 +222,8 @@ def sequence_categorical_column_with_identity( ValueError: if `default_value` is not in range `[0, num_buckets)`. """ return fc._SequenceCategoricalColumn( - fc.categorical_column_with_identity( - key=key, - num_buckets=num_buckets, - default_value=default_value)) + fc._categorical_column_with_identity( + key=key, num_buckets=num_buckets, default_value=default_value)) def sequence_categorical_column_with_hash_bucket( @@ -265,10 +263,8 @@ def sequence_categorical_column_with_hash_bucket( ValueError: `dtype` is neither string nor integer. """ return fc._SequenceCategoricalColumn( - fc.categorical_column_with_hash_bucket( - key=key, - hash_bucket_size=hash_bucket_size, - dtype=dtype)) + fc._categorical_column_with_hash_bucket( + key=key, hash_bucket_size=hash_bucket_size, dtype=dtype)) def sequence_categorical_column_with_vocabulary_file( @@ -324,7 +320,7 @@ def sequence_categorical_column_with_vocabulary_file( ValueError: `dtype` is neither string nor integer. """ return fc._SequenceCategoricalColumn( - fc.categorical_column_with_vocabulary_file( + fc._categorical_column_with_vocabulary_file( key=key, vocabulary_file=vocabulary_file, vocabulary_size=vocabulary_size, @@ -384,7 +380,7 @@ def sequence_categorical_column_with_vocabulary_list( ValueError: if `dtype` is not integer or string. 
""" return fc._SequenceCategoricalColumn( - fc.categorical_column_with_vocabulary_list( + fc._categorical_column_with_vocabulary_list( key=key, vocabulary_list=vocabulary_list, dtype=dtype, diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py index d8ca363627e..bcc25b8de89 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py @@ -53,19 +53,20 @@ class SequenceFeatureColumnIntegrationTest(test.TestCase): return example def _build_feature_columns(self): - col = fc.categorical_column_with_identity( - 'int_ctx', num_buckets=100) + col = fc._categorical_column_with_identity('int_ctx', num_buckets=100) ctx_cols = [ - fc.embedding_column(col, dimension=10), - fc.numeric_column('float_ctx')] + fc._embedding_column(col, dimension=10), + fc._numeric_column('float_ctx') + ] identity_col = sfc.sequence_categorical_column_with_identity( 'int_list', num_buckets=10) bucket_col = sfc.sequence_categorical_column_with_hash_bucket( 'bytes_list', hash_bucket_size=100) seq_cols = [ - fc.embedding_column(identity_col, dimension=10), - fc.embedding_column(bucket_col, dimension=20)] + fc._embedding_column(identity_col, dimension=10), + fc._embedding_column(bucket_col, dimension=20) + ] return ctx_cols, seq_cols @@ -148,8 +149,8 @@ class SequenceExampleParsingTest(test.TestCase): """ example = _make_sequence_example() columns = [ - fc.categorical_column_with_identity('int_ctx', num_buckets=100), - fc.numeric_column('float_ctx'), + fc._categorical_column_with_identity('int_ctx', num_buckets=100), + fc._numeric_column('float_ctx'), col_fn(col_name, col_arg) ] context, seq_features = parsing_ops.parse_single_sequence_example( diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py index 2163af0b438..d5f74028298 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc from tensorflow.python.feature_column import feature_column as fc +from tensorflow.python.feature_column import feature_column_lib as fc_lib from tensorflow.python.feature_column.feature_column import _LazyBuilder from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -109,13 +110,15 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc.embedding_column( - categorical_column_a, dimension=embedding_dimension_a, + embedding_column_a = fc._embedding_column( + categorical_column_a, + dimension=embedding_dimension_a, initializer=_get_initializer(embedding_dimension_a, embedding_values_a)) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - embedding_column_b = fc.embedding_column( - categorical_column_b, dimension=embedding_dimension_b, + embedding_column_b = fc._embedding_column( + 
categorical_column_b, + dimension=embedding_dimension_b, initializer=_get_initializer(embedding_dimension_b, embedding_values_b)) input_layer, sequence_length = sfc.sequence_input_layer( @@ -148,10 +151,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc.categorical_column_with_identity( + categorical_column_a = fc._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc.embedding_column( - categorical_column_a, dimension=2) + embedding_column_a = fc._embedding_column(categorical_column_a, dimension=2) with self.assertRaisesRegexp( ValueError, @@ -206,7 +208,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) # Test that columns are reordered alphabetically. - shared_embedding_columns = fc.shared_embedding_columns( + shared_embedding_columns = fc_lib.shared_embedding_columns( [categorical_column_b, categorical_column_a], dimension=embedding_dimension, initializer=_get_initializer(embedding_dimension, embedding_values)) @@ -244,11 +246,11 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc.categorical_column_with_identity( + categorical_column_a = fc._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - categorical_column_b = fc.categorical_column_with_identity( + categorical_column_b = fc._categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc.shared_embedding_columns( + shared_embedding_columns = fc_lib.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) with self.assertRaisesRegexp( @@ -315,10 +317,10 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size_a) - indicator_column_a = fc.indicator_column(categorical_column_a) + indicator_column_a = fc._indicator_column(categorical_column_a) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size_b) - indicator_column_b = fc.indicator_column(categorical_column_b) + indicator_column_b = fc._indicator_column(categorical_column_b) input_layer, sequence_length = sfc.sequence_input_layer( features={ 'aaa': sparse_input_a, @@ -342,9 +344,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc.categorical_column_with_identity( + categorical_column_a = fc._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc.indicator_column(categorical_column_a) + indicator_column_a = fc._indicator_column(categorical_column_a) with self.assertRaisesRegexp( ValueError, @@ -530,7 +532,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args) categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=3) - indicator_column = fc.indicator_column(categorical_column) + indicator_column = fc._indicator_column(categorical_column) input_layer, _ = sfc.sequence_input_layer( features={'aaa': sparse_input}, feature_columns=[indicator_column]) @@ -616,8 +618,7 @@ class InputLayerTest(test.TestCase): 
categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc.embedding_column( - categorical_column_a, dimension=2) + embedding_column_a = fc._embedding_column(categorical_column_a, dimension=2) with self.assertRaisesRegexp( ValueError, @@ -639,7 +640,7 @@ class InputLayerTest(test.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc.indicator_column(categorical_column_a) + indicator_column_a = fc._indicator_column(categorical_column_a) with self.assertRaisesRegexp( ValueError, @@ -918,8 +919,9 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, dimension=embedding_dimension, + embedding_column = fc._embedding_column( + categorical_column, + dimension=embedding_dimension, initializer=_initializer) embedding_lookup, _ = embedding_column._get_sequence_dense_tensor( @@ -956,8 +958,7 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, dimension=2) + embedding_column = fc._embedding_column(categorical_column, dimension=2) _, sequence_length = embedding_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -984,8 +985,7 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, dimension=2) + embedding_column = fc._embedding_column(categorical_column, dimension=2) _, sequence_length = embedding_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': sparse_input})) @@ -1055,7 +1055,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): key='aaa', num_buckets=vocabulary_size) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc.shared_embedding_columns( + shared_embedding_columns = fc_lib.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=embedding_dimension, initializer=_initializer) @@ -1101,7 +1101,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): expected_sequence_length_b = [2, 1] categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc.shared_embedding_columns( + shared_embedding_columns = fc_lib.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor( @@ -1152,7 +1152,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc.shared_embedding_columns( + shared_embedding_columns = fc_lib.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor( @@ -1218,7 +1218,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = 
fc.indicator_column(categorical_column) + indicator_column = fc._indicator_column(categorical_column) indicator_tensor, _ = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -1250,7 +1250,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = fc.indicator_column(categorical_column) + indicator_column = fc._indicator_column(categorical_column) _, sequence_length = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -1277,7 +1277,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = fc.indicator_column(categorical_column) + indicator_column = fc._indicator_column(categorical_column) _, sequence_length = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': sparse_input})) diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py index 67ffb939663..0d34ad16185 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py @@ -26,7 +26,7 @@ import collections from tensorflow.python.feature_column import feature_column as fc_old -from tensorflow.python.feature_column import feature_column_v2 as fc +from tensorflow.python.feature_column import feature_column_lib as fc from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape @@ -226,10 +226,8 @@ def sequence_categorical_column_with_identity( ValueError: if `default_value` is not in range `[0, num_buckets)`. """ return fc_old._SequenceCategoricalColumn( - fc_old.categorical_column_with_identity( - key=key, - num_buckets=num_buckets, - default_value=default_value)) + fc_old._categorical_column_with_identity( + key=key, num_buckets=num_buckets, default_value=default_value)) def sequence_categorical_column_with_hash_bucket( @@ -269,10 +267,8 @@ def sequence_categorical_column_with_hash_bucket( ValueError: `dtype` is neither string nor integer. """ return fc_old._SequenceCategoricalColumn( - fc_old.categorical_column_with_hash_bucket( - key=key, - hash_bucket_size=hash_bucket_size, - dtype=dtype)) + fc_old._categorical_column_with_hash_bucket( + key=key, hash_bucket_size=hash_bucket_size, dtype=dtype)) def sequence_categorical_column_with_vocabulary_file( @@ -328,7 +324,7 @@ def sequence_categorical_column_with_vocabulary_file( ValueError: `dtype` is neither string nor integer. """ return fc_old._SequenceCategoricalColumn( - fc_old.categorical_column_with_vocabulary_file( + fc_old._categorical_column_with_vocabulary_file( key=key, vocabulary_file=vocabulary_file, vocabulary_size=vocabulary_size, @@ -388,7 +384,7 @@ def sequence_categorical_column_with_vocabulary_list( ValueError: if `dtype` is not integer or string. """ return fc_old._SequenceCategoricalColumn( - fc_old.categorical_column_with_vocabulary_list( + fc_old._categorical_column_with_vocabulary_list( key=key, vocabulary_list=vocabulary_list, dtype=dtype, @@ -441,7 +437,7 @@ def sequence_numeric_column( ValueError: if any dimension in shape is not a positive integer. 
ValueError: if `dtype` is not convertible to `tf.float32`. """ - shape = fc._check_shape(shape=shape, key=key) + shape = fc_old._check_shape(shape=shape, key=key) if not (dtype.is_integer or dtype.is_floating): raise ValueError('dtype must be convertible to float. ' 'dtype: {}, key: {}'.format(dtype, key)) diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py index 5ecd85807c5..ca4398a1420 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py @@ -25,7 +25,7 @@ import numpy as np from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc_old from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc from tensorflow.python.feature_column import feature_column as fc_old -from tensorflow.python.feature_column import feature_column_v2 as fc +from tensorflow.python.feature_column import feature_column_lib as fc from tensorflow.python.feature_column.feature_column import _LazyBuilder from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -111,13 +111,15 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc_old.embedding_column( - categorical_column_a, dimension=embedding_dimension_a, + embedding_column_a = fc_old._embedding_column( + categorical_column_a, + dimension=embedding_dimension_a, initializer=_get_initializer(embedding_dimension_a, embedding_values_a)) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - embedding_column_b = fc_old.embedding_column( - categorical_column_b, dimension=embedding_dimension_b, + embedding_column_b = fc_old._embedding_column( + categorical_column_b, + dimension=embedding_dimension_b, initializer=_get_initializer(embedding_dimension_b, embedding_values_b)) input_layer, sequence_length = sfc.sequence_input_layer( @@ -150,9 +152,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc_old.categorical_column_with_identity( + categorical_column_a = fc_old._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc_old.embedding_column( + embedding_column_a = fc_old._embedding_column( categorical_column_a, dimension=2) with self.assertRaisesRegexp( @@ -208,7 +210,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) # Test that columns are reordered alphabetically. 
- shared_embedding_columns = fc_old.shared_embedding_columns( + shared_embedding_columns = fc.shared_embedding_columns( [categorical_column_b, categorical_column_a], dimension=embedding_dimension, initializer=_get_initializer(embedding_dimension, embedding_values)) @@ -246,11 +248,11 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc_old.categorical_column_with_identity( + categorical_column_a = fc_old._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - categorical_column_b = fc_old.categorical_column_with_identity( + categorical_column_b = fc_old._categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc_old.shared_embedding_columns( + shared_embedding_columns = fc.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) with self.assertRaisesRegexp( @@ -317,10 +319,10 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size_a) - indicator_column_a = fc_old.indicator_column(categorical_column_a) + indicator_column_a = fc_old._indicator_column(categorical_column_a) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size_b) - indicator_column_b = fc_old.indicator_column(categorical_column_b) + indicator_column_b = fc_old._indicator_column(categorical_column_b) input_layer, sequence_length = sfc.sequence_input_layer( features={ 'aaa': sparse_input_a, @@ -344,9 +346,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc_old.categorical_column_with_identity( + categorical_column_a = fc_old._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc_old.indicator_column(categorical_column_a) + indicator_column_a = fc_old._indicator_column(categorical_column_a) with self.assertRaisesRegexp( ValueError, @@ -532,7 +534,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args) categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=3) - indicator_column = fc_old.indicator_column(categorical_column) + indicator_column = fc_old._indicator_column(categorical_column) input_layer, _ = sfc.sequence_input_layer( features={'aaa': sparse_input}, feature_columns=[indicator_column]) @@ -618,7 +620,7 @@ class InputLayerTest(test.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc_old.embedding_column( + embedding_column_a = fc_old._embedding_column( categorical_column_a, dimension=2) with self.assertRaisesRegexp( @@ -641,7 +643,7 @@ class InputLayerTest(test.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc_old.indicator_column(categorical_column_a) + indicator_column_a = fc_old._indicator_column(categorical_column_a) with self.assertRaisesRegexp( ValueError, @@ -920,8 +922,9 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc_old.embedding_column( - categorical_column, dimension=embedding_dimension, + 
embedding_column = fc_old._embedding_column( + categorical_column, + dimension=embedding_dimension, initializer=_initializer) embedding_lookup, _ = embedding_column._get_sequence_dense_tensor( @@ -958,8 +961,7 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc_old.embedding_column( - categorical_column, dimension=2) + embedding_column = fc_old._embedding_column(categorical_column, dimension=2) _, sequence_length = embedding_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -986,8 +988,7 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc_old.embedding_column( - categorical_column, dimension=2) + embedding_column = fc_old._embedding_column(categorical_column, dimension=2) _, sequence_length = embedding_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': sparse_input})) @@ -1057,7 +1058,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): key='aaa', num_buckets=vocabulary_size) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc_old.shared_embedding_columns( + shared_embedding_columns = fc.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=embedding_dimension, initializer=_initializer) @@ -1103,7 +1104,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): expected_sequence_length_b = [2, 1] categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc_old.shared_embedding_columns( + shared_embedding_columns = fc.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor( @@ -1154,7 +1155,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc_old.shared_embedding_columns( + shared_embedding_columns = fc.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor( @@ -1220,7 +1221,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = fc_old.indicator_column(categorical_column) + indicator_column = fc_old._indicator_column(categorical_column) indicator_tensor, _ = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -1252,7 +1253,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = fc_old.indicator_column(categorical_column) + indicator_column = fc_old._indicator_column(categorical_column) _, sequence_length = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD index cd747df4d69..dad50a3a730 100644 --- a/tensorflow/contrib/framework/BUILD +++ b/tensorflow/contrib/framework/BUILD @@ -47,6 +47,11 @@ tf_custom_op_py_library( ":variable_ops_op_lib", ], 
srcs_version = "PY2AND3", + visibility = [ + "//learning/brain:__subpackages__", + "//tensorflow:__subpackages__", + "//video/youtube/personalization:__subpackages__", + ], deps = [ ":gen_variable_ops", "//tensorflow/contrib/util:util_py", @@ -66,6 +71,7 @@ tf_custom_op_py_library( "//tensorflow/python:resource_variable_ops", "//tensorflow/python:script_ops", "//tensorflow/python:smart_cond", + "//tensorflow/python:sort_ops", "//tensorflow/python:sparse_tensor", "//tensorflow/python:state_ops", "//tensorflow/python:state_ops_gen", @@ -311,17 +317,3 @@ py_test( "//third_party/py/numpy", ], ) - -py_test( - name = "sort_ops_test", - size = "medium", - srcs = ["python/ops/sort_ops_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":framework_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:random_ops", - "//third_party/py/numpy", - ], -) diff --git a/tensorflow/contrib/framework/python/ops/sort_ops.py b/tensorflow/contrib/framework/python/ops/sort_ops.py index 1921a77c1e9..42184a4e55e 100644 --- a/tensorflow/contrib/framework/python/ops/sort_ops.py +++ b/tensorflow/contrib/framework/python/ops/sort_ops.py @@ -22,173 +22,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np +from tensorflow.python.ops import sort_ops -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import ops as framework_ops -from tensorflow.python.framework import tensor_util -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn_ops - - -def sort(values, axis=-1, direction='ASCENDING', name=None): - """Sorts a tensor. - - Args: - values: 1-D or higher numeric `Tensor`. - axis: The axis along which to sort. The default is -1, which sorts the last - axis. - direction: The direction in which to sort the values (`'ASCENDING'` or - `'DESCENDING'`). - name: Optional name for the operation. - - Returns: - A `Tensor` with the same dtype and shape as `values`, with the elements - sorted along the given `axis`. - - Raises: - ValueError: If axis is not a constant scalar, or the direction is invalid. - """ - with framework_ops.name_scope(name, 'sort'): - return _sort_or_argsort(values, axis, direction, return_argsort=False) - - -def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None): - """Returns the indices of a tensor that give its sorted order along an axis. - - For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to - `tf.sort(values)`. For higher dimensions, the output has the same shape as - `values`, but along the given axis, values represent the index of the sorted - element in that slice of the tensor at the given position. - - Args: - values: 1-D or higher numeric `Tensor`. - axis: The axis along which to sort. The default is -1, which sorts the last - axis. - direction: The direction in which to sort the values (`'ASCENDING'` or - `'DESCENDING'`). - stable: If True, equal elements in the original tensor will not be - re-ordered in the returned order. Unstable sort is not yet implemented, - but will eventually be the default for performance reasons. If you - require a stable order, pass `stable=True` for forwards compatibility. - name: Optional name for the operation. - - Returns: - An int32 `Tensor` with the same shape as `values`. The indices that would - sort each slice of the given `values` along the given `axis`. 
- - Raises: - ValueError: If axis is not a constant scalar, or the direction is invalid. - """ - del stable # Unused. - with framework_ops.name_scope(name, 'argsort'): - return _sort_or_argsort(values, axis, direction, return_argsort=True) - - -def _sort_or_argsort(values, axis, direction, return_argsort): - """Internal sort/argsort implementation. - - Args: - values: The input values. - axis: The axis along which to sort. - direction: 'ASCENDING' or 'DESCENDING'. - return_argsort: Whether to return the argsort result. - - Returns: - Either the sorted values, or the indices of the sorted values in the - original tensor. See the `sort` and `argsort` docstrings. - - Raises: - ValueError: If axis is not a constant scalar, or the direction is invalid. - """ - if direction not in _SORT_IMPL: - raise ValueError('%s should be one of %s' % - (direction, ', '.join(sorted(_SORT_IMPL.keys())))) - # Axis must be an integer, not a Tensor. - axis = framework_ops.convert_to_tensor(axis, name='axis') - axis_static = tensor_util.constant_value(axis) - if axis.shape.ndims != 0 or axis_static is None: - raise ValueError('axis must be a constant scalar') - axis_static = int(axis_static) # Avoids NumPy casting error - - values = framework_ops.convert_to_tensor(values, name='values') - - return _SORT_IMPL[direction](values, axis_static, return_argsort) - - -def _descending_sort(values, axis, return_argsort=False): - """Sorts values in reverse using `top_k`. - - Args: - values: Tensor of numeric values. - axis: Index of the axis which values should be sorted along. - return_argsort: If False, return the sorted values. If True, return the - indices that would sort the values. - - Returns: - The sorted values. - """ - k = array_ops.shape(values)[axis] - rank = array_ops.rank(values) - static_rank = values.shape.ndims - # Fast path: sorting the last axis. - if axis == -1 or axis + 1 == values.get_shape().ndims: - top_k_input = values - transposition = None - else: - # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`. - if axis < 0: - # Calculate the actual axis index if counting from the end. Use the static - # rank if available, or else make the axis back into a tensor. - axis += static_rank or rank - if static_rank is not None: - # Prefer to calculate the transposition array in NumPy and make it a - # constant. - transposition = constant_op.constant( - np.r_[ - # Axes up to axis are unchanged. - np.arange(axis), - # Swap axis and rank - 1. - [static_rank - 1], - # Axes in [axis + 1, rank - 1) are unchanged. - np.arange(axis + 1, static_rank - 1), - # Swap axis and rank - 1. - [axis]], - name='transposition') - else: - # Generate the transposition array from the tensors. - transposition = array_ops.concat( - [ - # Axes up to axis are unchanged. - math_ops.range(axis), - # Swap axis and rank - 1. - [rank - 1], - # Axes in [axis + 1, rank - 1) are unchanged. - math_ops.range(axis + 1, rank - 1), - # Swap axis and rank - 1. - [axis] - ], - axis=0) - top_k_input = array_ops.transpose(values, transposition) - - values, indices = nn_ops.top_k(top_k_input, k) - return_value = indices if return_argsort else values - if transposition is not None: - # transposition contains a single cycle of length 2 (swapping 2 elements), - # so it is an involution (it is its own inverse). - return_value = array_ops.transpose(return_value, transposition) - return return_value - - -def _ascending_sort(values, axis, return_argsort=False): - # Negate the values to get the ascending order from descending sort. 
- values_or_indices = _descending_sort(-values, axis, return_argsort) - # If not argsort, negate the values again. - return values_or_indices if return_argsort else -values_or_indices - - -_SORT_IMPL = { - 'ASCENDING': _ascending_sort, - 'DESCENDING': _descending_sort, -} +sort = sort_ops.sort +argsort = sort_ops.argsort diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py index 219cc199d79..3593b501bb7 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py @@ -113,7 +113,8 @@ class GANEstimator(estimator.Estimator): add_summaries=None, use_loss_summaries=True, config=None, - warm_start_from=None): + warm_start_from=None, + is_chief=True): """Initializes a GANEstimator instance. Args: @@ -154,6 +155,8 @@ class GANEstimator(estimator.Estimator): config: `RunConfig` object to configure the runtime settings. warm_start_from: A filepath to a checkpoint or saved model, or a WarmStartSettings object to configure initialization. + is_chief: Whether or not this Estimator is running on a chief or worker. + Needs to be set appropriately if using SyncReplicasOptimizers. Raises: ValueError: If loss functions aren't callable. @@ -187,7 +190,7 @@ class GANEstimator(estimator.Estimator): return _get_estimator_spec( mode, gan_model, generator_loss_fn, discriminator_loss_fn, get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer, - get_hooks_fn, use_loss_summaries) + get_hooks_fn, use_loss_summaries, is_chief) super(GANEstimator, self).__init__( model_fn=_model_fn, model_dir=model_dir, config=config, @@ -215,7 +218,7 @@ def _get_gan_model( def _get_estimator_spec( mode, gan_model, generator_loss_fn, discriminator_loss_fn, get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer, - get_hooks_fn=None, use_loss_summaries=True): + get_hooks_fn=None, use_loss_summaries=True, is_chief=True): """Get the EstimatorSpec for the current mode.""" if mode == model_fn_lib.ModeKeys.PREDICT: estimator_spec = model_fn_lib.EstimatorSpec( @@ -236,7 +239,7 @@ def _get_estimator_spec( else discriminator_optimizer) get_hooks_fn = get_hooks_fn or tfgan_train.get_sequential_train_hooks() estimator_spec = _get_train_estimator_spec( - gan_model, gan_loss, gopt, dopt, get_hooks_fn) + gan_model, gan_loss, gopt, dopt, get_hooks_fn, is_chief=is_chief) return estimator_spec @@ -321,11 +324,11 @@ def _get_eval_estimator_spec(gan_model, gan_loss, get_eval_metric_ops_fn=None, def _get_train_estimator_spec( gan_model, gan_loss, generator_optimizer, discriminator_optimizer, - get_hooks_fn, train_op_fn=tfgan_train.gan_train_ops): + get_hooks_fn, train_op_fn=tfgan_train.gan_train_ops, is_chief=True): """Return an EstimatorSpec for the train case.""" scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss train_ops = train_op_fn(gan_model, gan_loss, generator_optimizer, - discriminator_optimizer) + discriminator_optimizer, is_chief=is_chief) training_hooks = get_hooks_fn(train_ops) return model_fn_lib.EstimatorSpec( loss=scalar_loss, diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py index 3d6bdab0ad7..bc9021050bc 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py @@ -48,6 +48,7 @@ from 
tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache from tensorflow.python.training import input as input_lib from tensorflow.python.training import learning_rate_decay +from tensorflow.python.training import sync_replicas_optimizer from tensorflow.python.training import training from tensorflow.python.training import training_util @@ -82,7 +83,7 @@ class GetGANModelTest(test.TestCase, parameterized.TestCase): self.assertEqual(generator_inputs, gan_model.generator_inputs) self.assertIsNotNone(gan_model.generated_data) - self.assertEqual(2, len(gan_model.generator_variables)) # 1 FC layer + self.assertLen(gan_model.generator_variables, 2) # 1 FC layer self.assertIsNotNone(gan_model.generator_fn) if mode == model_fn_lib.ModeKeys.PREDICT: self.assertIsNone(gan_model.real_data) @@ -95,7 +96,7 @@ class GetGANModelTest(test.TestCase, parameterized.TestCase): self.assertIsNotNone(gan_model.real_data) self.assertIsNotNone(gan_model.discriminator_real_outputs) self.assertIsNotNone(gan_model.discriminator_gen_outputs) - self.assertEqual(2, len(gan_model.discriminator_variables)) # 1 FC layer + self.assertLen(gan_model.discriminator_variables, 2) # 1 FC layer self.assertIsNotNone(gan_model.discriminator_scope) self.assertIsNotNone(gan_model.discriminator_fn) @@ -121,6 +122,7 @@ def get_dummy_gan_model(): def dummy_loss_fn(gan_model, add_summaries=True): + del add_summaries return math_ops.reduce_sum(gan_model.discriminator_real_outputs - gan_model.discriminator_gen_outputs) @@ -168,6 +170,35 @@ class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase): self.assertShapeEqual(np.array(0), spec.loss) # must be a scalar self.assertIsNotNone(spec.eval_metric_ops) + def test_get_sync_estimator_spec(self): + """Make sure spec is loaded with sync hooks for sync opts.""" + + def get_sync_optimizer(): + return sync_replicas_optimizer.SyncReplicasOptimizer( + training.GradientDescentOptimizer(learning_rate=1.0), + replicas_to_aggregate=1) + + with ops.Graph().as_default(): + self._gan_model = get_dummy_gan_model() + g_opt = get_sync_optimizer() + d_opt = get_sync_optimizer() + + spec = estimator._get_estimator_spec( + model_fn_lib.ModeKeys.TRAIN, + self._gan_model, + generator_loss_fn=dummy_loss_fn, + discriminator_loss_fn=dummy_loss_fn, + get_eval_metric_ops_fn=get_metrics, + generator_optimizer=g_opt, + discriminator_optimizer=d_opt) + + self.assertLen(spec.training_hooks, 4) + sync_opts = [ + hook._sync_optimizer for hook in spec.training_hooks if + isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)] + self.assertLen(sync_opts, 2) + self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt))) + # TODO(joelshor): Add pandas test. 
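The hunks above add an `is_chief` argument to `GANEstimator` and a test that the sync-replica hooks land in the returned `EstimatorSpec`. As a reading aid, here is a minimal, hypothetical sketch of how a caller might wire this up; `generator_fn`, `discriminator_fn`, and `train_input_fn` are placeholders, and the snippet is an assumption based on the APIs touched in this patch, not part of the patch itself.

```python
# Hypothetical usage sketch (not part of this patch): a GANEstimator driven by
# SyncReplicasOptimizers. `generator_fn`, `discriminator_fn`, and
# `train_input_fn` are user-supplied placeholders.
import tensorflow as tf

tfgan = tf.contrib.gan


def make_sync_optimizer(learning_rate, replicas_to_aggregate):
  # Wrap a plain optimizer so gradients are aggregated across replicas before
  # a single update is applied.
  return tf.train.SyncReplicasOptimizer(
      tf.train.GradientDescentOptimizer(learning_rate),
      replicas_to_aggregate=replicas_to_aggregate)


config = tf.estimator.RunConfig()  # cluster/task info normally comes from TF_CONFIG
gan_estimator = tfgan.estimator.GANEstimator(
    generator_fn=generator_fn,
    discriminator_fn=discriminator_fn,
    generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
    discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
    generator_optimizer=make_sync_optimizer(1e-3, replicas_to_aggregate=4),
    discriminator_optimizer=make_sync_optimizer(1e-3, replicas_to_aggregate=4),
    config=config,
    # New in this patch: must be set to False on non-chief workers so the
    # SyncReplicasOptimizer hooks are created with the correct role.
    is_chief=config.is_chief)
gan_estimator.train(train_input_fn, max_steps=10000)
```

With sync optimizers the train spec should then carry four training hooks (two `RunTrainOpsHook`s plus one `_SyncReplicasOptimizerHook` per optimizer), which is exactly what `test_get_sync_estimator_spec` above asserts.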
class GANEstimatorIntegrationTest(test.TestCase): diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py index df0342c80c5..a0a86c6337e 100644 --- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py +++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py @@ -36,7 +36,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np from tensorflow.contrib.framework.python.ops import variables as contrib_variables_lib from tensorflow.python.framework import ops @@ -47,7 +46,6 @@ from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import variable_scope -from tensorflow.python.ops.distributions import distribution as ds from tensorflow.python.ops.losses import losses from tensorflow.python.ops.losses import util from tensorflow.python.summary import summary @@ -740,11 +738,16 @@ def least_squares_discriminator_loss( def _validate_distributions(distributions): if not isinstance(distributions, (list, tuple)): raise ValueError('`distributions` must be a list or tuple. Instead, ' - 'found %s.', type(distributions)) + 'found %s.' % type(distributions)) for x in distributions: - if not isinstance(x, ds.Distribution): + # We used to check with `isinstance(x, tf.distributions.Distribution)`. + # However, distributions have migrated to `tfp.distributions.Distribution`, + # which is a new code repo, so we can't check this way anymore until + # TF-GAN is migrated to a new repo as well. + # This new check is not sufficient, but is a useful heuristic for now. + if not callable(getattr(x, 'log_prob', None)): raise ValueError('`distributions` must be a list of `Distributions`. ' - 'Instead, found %s.', type(x)) + 'Instead, found %s.' % type(x)) def _validate_information_penalty_inputs( @@ -817,7 +820,7 @@ def _numerically_stable_global_norm(tensor_list): Returns: A scalar tensor with the global norm. """ - if np.all([x is None for x in tensor_list]): + if all(x is None for x in tensor_list): return 0.0 list_max = math_ops.reduce_max([math_ops.reduce_max(math_ops.abs(x)) for x in diff --git a/tensorflow/contrib/gan/python/namedtuples.py b/tensorflow/contrib/gan/python/namedtuples.py index b9ac1bf1513..969b68449d9 100644 --- a/tensorflow/contrib/gan/python/namedtuples.py +++ b/tensorflow/contrib/gan/python/namedtuples.py @@ -213,7 +213,8 @@ class GANTrainOps( collections.namedtuple('GANTrainOps', ( 'generator_train_op', 'discriminator_train_op', - 'global_step_inc_op' + 'global_step_inc_op', + 'train_hooks' ))): """GANTrainOps contains the training ops. @@ -221,8 +222,17 @@ class GANTrainOps( generator_train_op: Op that performs a generator update step. discriminator_train_op: Op that performs a discriminator update step. global_step_inc_op: Op that increments the shared global step. + train_hooks: a list or tuple containing hooks related to training that need + to be populated when training ops are instantiated. Used primarily for + sync hooks. 
""" + def __new__(cls, generator_train_op, discriminator_train_op, + global_step_inc_op, train_hooks=()): + return super(GANTrainOps, cls).__new__(cls, generator_train_op, + discriminator_train_op, + global_step_inc_op, train_hooks) + class GANTrainSteps( collections.namedtuple('GANTrainSteps', ( diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py index 7ee39f304ab..4c7bee41b33 100644 --- a/tensorflow/contrib/gan/python/train.py +++ b/tensorflow/contrib/gan/python/train.py @@ -114,7 +114,7 @@ def gan_model( discriminator_gen_outputs = discriminator_fn(generated_data, generator_inputs) with variable_scope.variable_scope(dis_scope, reuse=True): - real_data = ops.convert_to_tensor(real_data) + real_data = _convert_tensor_or_l_or_d(real_data) discriminator_real_outputs = discriminator_fn(real_data, generator_inputs) if check_shapes: @@ -924,6 +924,7 @@ def gan_train_ops( generator_optimizer, discriminator_optimizer, check_for_unused_update_ops=True, + is_chief=True, # Optional args to pass directly to the `create_train_op`. **kwargs): """Returns GAN train ops. @@ -939,6 +940,8 @@ def gan_train_ops( discriminator_optimizer: The optimizer for the discriminator updates. check_for_unused_update_ops: If `True`, throws an exception if there are update ops outside of the generator or discriminator scopes. + is_chief: Specifies whether or not the training is being run by the primary + replica during replica training. **kwargs: Keyword args to pass directly to `training.create_train_op` for both the generator and discriminator train op. @@ -980,6 +983,9 @@ def gan_train_ops( kwargs, model.generator_scope.name, model.discriminator_scope.name, check_for_unused_update_ops) + # Get the sync hooks if these are needed. + sync_hooks = [] + generator_global_step = None if isinstance(generator_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): @@ -995,6 +1001,7 @@ def gan_train_ops( trainable=False, collections=[ops.GraphKeys.GLOBAL_VARIABLES]) gen_update_ops += [generator_global_step.assign(global_step)] + sync_hooks.append(generator_optimizer.make_session_run_hook(is_chief)) with ops.name_scope('generator_train'): gen_train_op = training.create_train_op( total_loss=loss.generator_loss, @@ -1016,6 +1023,7 @@ def gan_train_ops( trainable=False, collections=[ops.GraphKeys.GLOBAL_VARIABLES]) dis_update_ops += [discriminator_global_step.assign(global_step)] + sync_hooks.append(discriminator_optimizer.make_session_run_hook(is_chief)) with ops.name_scope('discriminator_train'): disc_train_op = training.create_train_op( total_loss=loss.discriminator_loss, @@ -1025,7 +1033,8 @@ def gan_train_ops( update_ops=dis_update_ops, **kwargs) - return namedtuples.GANTrainOps(gen_train_op, disc_train_op, global_step_inc) + return namedtuples.GANTrainOps(gen_train_op, disc_train_op, global_step_inc, + sync_hooks) # TODO(joelshor): Implement a dynamic GAN train loop, as in `Real-Time Adaptive @@ -1066,13 +1075,24 @@ def get_sequential_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)): train_steps.generator_train_steps) discriminator_hook = RunTrainOpsHook(train_ops.discriminator_train_op, train_steps.discriminator_train_steps) - return [generator_hook, discriminator_hook] + return [generator_hook, discriminator_hook] + list(train_ops.train_hooks) return get_hooks +def _num_joint_steps(train_steps): + g_steps = train_steps.generator_train_steps + d_steps = train_steps.discriminator_train_steps + # Get the number of each type of step that should be run. 
+ num_d_and_g_steps = min(g_steps, d_steps) + num_g_steps = g_steps - num_d_and_g_steps + num_d_steps = d_steps - num_d_and_g_steps + + return num_d_and_g_steps, num_g_steps, num_d_steps + + def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)): - """Returns a hooks function for sequential GAN training. + """Returns a hooks function for joint GAN training. When using these train hooks, IT IS RECOMMENDED TO USE `use_locking=True` ON ALL OPTIMIZERS TO AVOID RACE CONDITIONS. @@ -1105,12 +1125,7 @@ def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)): Returns: A function that takes a GANTrainOps tuple and returns a list of hooks. """ - g_steps = train_steps.generator_train_steps - d_steps = train_steps.discriminator_train_steps - # Get the number of each type of step that should be run. - num_d_and_g_steps = min(g_steps, d_steps) - num_g_steps = g_steps - num_d_and_g_steps - num_d_steps = d_steps - num_d_and_g_steps + num_d_and_g_steps, num_g_steps, num_d_steps = _num_joint_steps(train_steps) def get_hooks(train_ops): g_op = train_ops.generator_train_op @@ -1120,7 +1135,7 @@ def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)): g_hook = RunTrainOpsHook(g_op, num_g_steps) d_hook = RunTrainOpsHook(d_op, num_d_steps) - return [joint_hook, g_hook, d_hook] + return [joint_hook, g_hook, d_hook] + list(train_ops.train_hooks) return get_hooks diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py index 64d67061990..841f25cd7f1 100644 --- a/tensorflow/contrib/gan/python/train_test.py +++ b/tensorflow/contrib/gan/python/train_test.py @@ -519,7 +519,7 @@ class GANLossTest(test.TestCase, parameterized.TestCase): """Test output type.""" loss = train.gan_loss(get_gan_model_fn(), add_summaries=True) self.assertIsInstance(loss, namedtuples.GANLoss) - self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0) + self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES)) @parameterized.named_parameters( ('cyclegan', create_cyclegan_model), @@ -528,7 +528,7 @@ class GANLossTest(test.TestCase, parameterized.TestCase): def test_cyclegan_output_type(self, get_gan_model_fn): loss = train.cyclegan_loss(get_gan_model_fn(), add_summaries=True) self.assertIsInstance(loss, namedtuples.CycleGANLoss) - self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0) + self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES)) @parameterized.named_parameters( ('gan', create_gan_model, False), @@ -759,7 +759,7 @@ class TensorPoolAdjusteModelTest(test.TestCase): # For [pool_size, ?), the pool is full, tensor2 must be equal to some # historical values of tensor1 (which is previously stored in the # pool). - self.assertTrue(any([(v == t2).all() for v in history_values])) + self.assertTrue(any((v == t2).all() for v in history_values)) def _make_new_model_and_check(self, model, pool_size): pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=pool_size) @@ -836,6 +836,9 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase): self.assertIsInstance(train_ops, namedtuples.GANTrainOps) + # Make sure there are no training hooks populated accidentally. + self.assertEmpty(train_ops.train_hooks) + # TODO(joelshor): Add a test to check that custom update op is run. 
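The changes above thread the new `GANTrainOps.train_hooks` field from `gan_train_ops` through `get_sequential_train_hooks` and `get_joint_train_hooks`. A minimal sketch of the intended flow follows; `generator_fn`, `discriminator_fn`, `real_data`, and `generator_inputs` are placeholders, and the snippet mirrors the tests in this patch rather than documenting a canonical recipe.

```python
# Minimal sketch of the train_hooks plumbing added in this patch. The four
# *_fn / data names below are user-supplied placeholders.
import tensorflow as tf

tfgan = tf.contrib.gan

model = tfgan.gan_model(generator_fn, discriminator_fn, real_data,
                        generator_inputs)
loss = tfgan.gan_loss(model)

g_opt = tf.train.SyncReplicasOptimizer(
    tf.train.GradientDescentOptimizer(1.0), replicas_to_aggregate=1)
d_opt = tf.train.SyncReplicasOptimizer(
    tf.train.GradientDescentOptimizer(1.0), replicas_to_aggregate=1)

# gan_train_ops now stashes the SyncReplicasOptimizer session-run hooks in
# GANTrainOps.train_hooks (an empty tuple for ordinary optimizers).
train_ops = tfgan.gan_train_ops(model, loss, g_opt, d_opt, is_chief=True)

# The hook factories append train_ops.train_hooks to the RunTrainOpsHooks they
# already build, so gan_train picks the sync hooks up automatically:
#   sequential hooks -> 2 RunTrainOpsHooks + 2 sync hooks = 4
#   joint hooks      -> 3 RunTrainOpsHooks + 2 sync hooks = 5
hooks = tfgan.get_sequential_train_hooks()(train_ops)

# For get_joint_train_hooks, GANTrainSteps(g, d) is split as in
# _num_joint_steps: min(g, d) joint steps, then g - min(g, d) generator-only
# and d - min(g, d) discriminator-only steps.
```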
@parameterized.named_parameters( ('gan', create_gan_model, False), @@ -923,8 +926,15 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase): model, loss, generator_optimizer=g_opt, discriminator_optimizer=d_opt) self.assertIsInstance(train_ops, namedtuples.GANTrainOps) # No new trainable variables should have been added. - self.assertEqual(num_trainable_vars, - len(variables_lib.get_trainable_variables())) + self.assertLen(variables_lib.get_trainable_variables(), num_trainable_vars) + + # Sync hooks should be populated in the GANTrainOps. + self.assertLen(train_ops.train_hooks, 2) + for hook in train_ops.train_hooks: + self.assertIsInstance( + hook, sync_replicas_optimizer._SyncReplicasOptimizerHook) + sync_opts = [hook._sync_optimizer for hook in train_ops.train_hooks] + self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt))) g_sync_init_op = g_opt.get_init_tokens_op(num_tokens=1) d_sync_init_op = d_opt.get_init_tokens_op(num_tokens=1) @@ -959,6 +969,32 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase): coord.request_stop() coord.join(g_threads + d_threads) + @parameterized.named_parameters( + ('is_chief', True), + ('is_not_chief', False), + ) + def test_is_chief_in_train_hooks(self, is_chief): + """Make sure is_chief is propagated correctly to sync hooks.""" + model = create_gan_model() + loss = train.gan_loss(model) + g_opt = get_sync_optimizer() + d_opt = get_sync_optimizer() + train_ops = train.gan_train_ops( + model, + loss, + g_opt, + d_opt, + is_chief=is_chief, + summarize_gradients=True, + colocate_gradients_with_ops=True) + + self.assertLen(train_ops.train_hooks, 2) + for hook in train_ops.train_hooks: + self.assertIsInstance( + hook, sync_replicas_optimizer._SyncReplicasOptimizerHook) + is_chief_list = [hook._is_chief for hook in train_ops.train_hooks] + self.assertListEqual(is_chief_list, [is_chief, is_chief]) + class GANTrainTest(test.TestCase, parameterized.TestCase): """Tests for `gan_train`.""" @@ -1036,6 +1072,44 @@ class GANTrainTest(test.TestCase, parameterized.TestCase): self.assertTrue(np.isscalar(final_loss)) self.assertEqual(17.0, final_loss) + @parameterized.named_parameters( + ('gan', create_gan_model), + ('callable_gan', create_callable_gan_model), + ('infogan', create_infogan_model), + ('callable_infogan', create_callable_infogan_model), + ('acgan', create_acgan_model), + ('callable_acgan', create_callable_acgan_model), + ) + def test_train_hooks_exist_in_get_hooks_fn(self, create_gan_model_fn): + model = create_gan_model_fn() + loss = train.gan_loss(model) + + g_opt = get_sync_optimizer() + d_opt = get_sync_optimizer() + train_ops = train.gan_train_ops( + model, + loss, + g_opt, + d_opt, + summarize_gradients=True, + colocate_gradients_with_ops=True) + + sequential_train_hooks = train.get_sequential_train_hooks()(train_ops) + self.assertLen(sequential_train_hooks, 4) + sync_opts = [ + hook._sync_optimizer for hook in sequential_train_hooks if + isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)] + self.assertLen(sync_opts, 2) + self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt))) + + joint_train_hooks = train.get_joint_train_hooks()(train_ops) + self.assertLen(joint_train_hooks, 5) + sync_opts = [ + hook._sync_optimizer for hook in joint_train_hooks if + isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)] + self.assertLen(sync_opts, 2) + self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt))) + class PatchGANTest(test.TestCase, parameterized.TestCase): """Tests 
that functions work on PatchGAN style output.""" diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc index 94f522c04e5..fbccbead03f 100644 --- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc +++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc @@ -170,6 +170,14 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous { // Record "call" in active_ so that it can be aborted cleanly. RegisterCall(call); + // RendezvousMgr already aborted, shouldn't send RPC call any more + if (!call->status().ok()) { + done(call->status(), Args(), Args(), Tensor(), false); + session()->worker_cache->ReleaseWorker(src_worker, rwi); + delete call; + return; + } + // Start "call". Ref(); call->Start([this, call, src_worker, rwi, done]() { diff --git a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py index f7f1189bb93..bc941ae9f23 100644 --- a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py +++ b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import os from tensorflow.contrib.hadoop.python.ops import hadoop_dataset_ops +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -47,7 +48,7 @@ class SequenceFileDatasetTest(test.TestCase): dataset = hadoop_dataset_ops.SequenceFileDataset(filenames).repeat( num_repeats) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py index bf398b838df..d3fcc8cb2a9 100644 --- a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py +++ b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py @@ -40,15 +40,12 @@ class SequenceFileDataset(dataset_ops.DatasetSource): For example: ```python + tf.enable_eager_execution() + dataset = tf.contrib.hadoop.SequenceFileDataset("/foo/bar.seq") - iterator = dataset.make_one_shot_iterator() - next_element = iterator.get_next() # Prints the (key, value) pairs inside a hadoop sequence file. 
- while True: - try: - print(sess.run(next_element)) - except tf.errors.OutOfRangeError: - break + for key, value in dataset: + print(key, value) ``` Args: diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md index c7db0b77e25..5a8c650fb92 100644 --- a/tensorflow/contrib/ignite/README.md +++ b/tensorflow/contrib/ignite/README.md @@ -54,14 +54,12 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL ```python >>> import tensorflow as tf >>> from tensorflow.contrib.ignite import IgniteDataset ->>> ->>> dataset = IgniteDataset(cache_name="SQL_PUBLIC_KITTEN_CACHE") ->>> iterator = dataset.make_one_shot_iterator() ->>> next_obj = iterator.get_next() +>>> tf.enable_eager_execution() >>> ->>> with tf.Session() as sess: ->>> for _ in range(3): ->>> print(sess.run(next_obj)) +>>> dataset = IgniteDataset(cache_name="SQL_PUBLIC_KITTEN_CACHE") +>>> +>>> for element in dataset: +>>> print(element) {'key': 1, 'val': {'NAME': b'WARM KITTY'}} {'key': 2, 'val': {'NAME': b'SOFT KITTY'}} @@ -74,23 +72,22 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL ```python >>> import tensorflow as tf >>> from tensorflow.contrib.ignite import IgniteDataset ->>> ->>> dataset = IgniteDataset(cache_name="IMAGES") ->>> iterator = dataset.make_one_shot_iterator() ->>> next_obj = iterator.get_next() +>>> tf.enable_eager_execution() >>> ->>> with tf.Session() as sess: ->>> print(sess.run(next_obj)) +>>> dataset = IgniteDataset(cache_name="IMAGES") +>>> +>>> for element in dataset.take(1): +>>> print(element) { - 'key': 'kitten.png', + 'key': 'kitten.png', 'val': { 'metadata': { 'file_name': b'kitten.png', 'label': b'little ball of fur', - width: 800, + width: 800, height: 600 - }, + }, 'pixels': [0, 0, 0, 0, ..., 0] } } @@ -100,13 +97,11 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL ```python >>> import tensorflow as tf >>> from tensorflow.contrib.ignite import IgniteDataset ->>> ->>> dataset = IgniteDataset(cache_name="IMAGES").map(lambda obj: obj['val']['pixels']) ->>> iterator = dataset.make_one_shot_iterator() ->>> next_obj = iterator.get_next() >>> ->>> with tf.Session() as sess: ->>> print(sess.run(next_obj)) +>>> dataset = IgniteDataset(cache_name="IMAGES").map(lambda obj: obj['val']['pixels']) +>>> +>>> for element in dataset: +>>> print(element) [0, 0, 0, 0, ..., 0] ``` @@ -126,18 +121,18 @@ Ignite Dataset allows using these two aspects of distributed neural network trai ```python >>> import tensorflow as tf >>> from tensorflow.contrib.ignite import IgniteDataset ->>> +>>> >>> dataset = IgniteDataset("IMAGES") >>> >>> # Compute gradients locally on every worker node. ->>> gradients = [] +>>> gradients = [] >>> for i in range(5): >>> with tf.device("/job:WORKER/task:%d" % i): ->>> device_iterator = dataset.make_one_shot_iterator() +>>> device_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) >>> device_next_obj = device_iterator.get_next() >>> gradient = compute_gradient(device_next_obj) ->>> gradients.append(gradient) ->>> +>>> gradients.append(gradient) +>>> >>> # Aggregate them on master node. >>> result_gradient = tf.reduce_sum(gradients) >>> @@ -145,7 +140,7 @@ Ignite Dataset allows using these two aspects of distributed neural network trai >>> print(sess.run(result_gradient)) ``` -High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well. 
+High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well. ### Distributed File System diff --git a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py index ef29b5f14a4..ff5d4c458c8 100644 --- a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py +++ b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py @@ -21,6 +21,7 @@ import os from tensorflow.contrib.ignite import IgniteDataset from tensorflow.python.client import session +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.platform import test @@ -65,7 +66,7 @@ class IgniteDatasetTest(test.TestCase): self.assertEqual(dtypes.string, dataset.output_types["val"]["NAME"]) self.assertEqual(dtypes.int64, dataset.output_types["val"]["VAL"]) - it = dataset.make_one_shot_iterator() + it = dataset_ops.make_one_shot_iterator(dataset) ne = it.get_next() with session.Session() as sess: diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc index 478b716d883..108da044946 100644 --- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc +++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc @@ -115,7 +115,7 @@ class AdjustHsvInYiqOp : public AdjustHsvInYiqOpBase { *context->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads.num_threads, worker_threads.workers, channel_count, kCostPerChannel, - [channel_count, &input_data, &output_data, &tranformation_matrix]( + [&input_data, &output_data, &tranformation_matrix]( int64 start_channel, int64 end_channel) { // Applying projection matrix to input RGB vectors. const float* p = input_data.data() + start_channel * kChannelSize; diff --git a/tensorflow/contrib/image/python/ops/dense_image_warp.py b/tensorflow/contrib/image/python/ops/dense_image_warp.py index 9c7ada7afb7..7930b8317b6 100644 --- a/tensorflow/contrib/image/python/ops/dense_image_warp.py +++ b/tensorflow/contrib/image/python/ops/dense_image_warp.py @@ -25,7 +25,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops - +from tensorflow.python.ops import check_ops def _interpolate_bilinear(grid, query_points, @@ -60,28 +60,40 @@ def _interpolate_bilinear(grid, msg = 'Grid must be 4 dimensional. Received size: ' raise ValueError(msg + str(grid.get_shape())) - batch_size, height, width, channels = shape + batch_size, height, width, channels = (array_ops.shape(grid)[0], + array_ops.shape(grid)[1], + array_ops.shape(grid)[2], + array_ops.shape(grid)[3]) + + shape = [batch_size, height, width, channels] query_type = query_points.dtype grid_type = grid.dtype - if (query_points.shape.rank != 3 or - query_points.shape.dims[2].value != 2): - msg = ('Query points must be 3 dimensional and size 2 in dim 2. 
Received ' - 'size: ') - raise ValueError(msg + str(query_points.get_shape())) + with ops.control_dependencies([ + check_ops.assert_equal( + len(query_points.get_shape()), + 3, + message='Query points must be 3 dimensional.'), + check_ops.assert_equal( + array_ops.shape(query_points)[2], + 2, + message='Query points must be size 2 in dim 2.')]): + num_queries = array_ops.shape(query_points)[1] - _, num_queries, _ = query_points.get_shape().as_list() - - if height < 2 or width < 2: - msg = 'Grid must be at least batch_size x 2 x 2 in size. Received size: ' - raise ValueError(msg + str(grid.get_shape())) - - alphas = [] - floors = [] - ceils = [] - - index_order = [0, 1] if indexing == 'ij' else [1, 0] - unstacked_query_points = array_ops.unstack(query_points, axis=2) + with ops.control_dependencies([ + check_ops.assert_greater_equal( + height, + 2, + message='Grid height must be at least 2.'), + check_ops.assert_greater_equal( + width, + 2, + message='Grid width must be at least 2.')]): + alphas = [] + floors = [] + ceils = [] + index_order = [0, 1] if indexing == 'ij' else [1, 0] + unstacked_query_points = array_ops.unstack(query_points, axis=2) for dim in index_order: with ops.name_scope('dim-' + str(dim)): @@ -112,16 +124,17 @@ def _interpolate_bilinear(grid, alpha = array_ops.expand_dims(alpha, 2) alphas.append(alpha) - if batch_size * height * width > np.iinfo(np.int32).max / 8: - error_msg = """The image size or batch size is sufficiently large - that the linearized addresses used by array_ops.gather - may exceed the int32 limit.""" - raise ValueError(error_msg) - - flattened_grid = array_ops.reshape(grid, - [batch_size * height * width, channels]) - batch_offsets = array_ops.reshape( - math_ops.range(batch_size) * height * width, [batch_size, 1]) + with ops.control_dependencies([ + check_ops.assert_less_equal( + math_ops.cast(batch_size * height * width, dtype=dtypes.float32), + np.iinfo(np.int32).max / 8, + message="""The image size or batch size is sufficiently large + that the linearized addresses used by array_ops.gather + may exceed the int32 limit.""")]): + flattened_grid = array_ops.reshape( + grid, [batch_size * height * width, channels]) + batch_offsets = array_ops.reshape( + math_ops.range(batch_size) * height * width, [batch_size, 1]) # This wraps array_ops.gather. We reshape the image data such that the # batch, y, and x coordinates are pulled into the first dimension. @@ -182,7 +195,11 @@ def dense_image_warp(image, flow, name='dense_image_warp'): of dimensions. """ with ops.name_scope(name): - batch_size, height, width, channels = image.get_shape().as_list() + batch_size, height, width, channels = (array_ops.shape(image)[0], + array_ops.shape(image)[1], + array_ops.shape(image)[2], + array_ops.shape(image)[3]) + # The flow is defined on the image grid. Turn the flow into a list of query # points in the grid space. grid_x, grid_y = array_ops.meshgrid( diff --git a/tensorflow/contrib/keras/api/keras/layers/__init__.py b/tensorflow/contrib/keras/api/keras/layers/__init__.py index 3327a9f9a61..9e19884df85 100644 --- a/tensorflow/contrib/keras/api/keras/layers/__init__.py +++ b/tensorflow/contrib/keras/api/keras/layers/__init__.py @@ -20,7 +20,7 @@ from __future__ import print_function # Generic layers. 
# pylint: disable=g-bad-import-order -from tensorflow.python.keras.engine.base_layer import InputSpec +from tensorflow.python.keras.engine.input_spec import InputSpec from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.engine.input_layer import Input from tensorflow.python.keras.engine.input_layer import InputLayer diff --git a/tensorflow/contrib/keras/api/keras/utils/__init__.py b/tensorflow/contrib/keras/api/keras/utils/__init__.py index 47cd01b924f..3b9fa1b230b 100644 --- a/tensorflow/contrib/keras/api/keras/utils/__init__.py +++ b/tensorflow/contrib/keras/api/keras/utils/__init__.py @@ -30,6 +30,7 @@ from tensorflow.python.keras.utils.generic_utils import Progbar from tensorflow.python.keras.utils.generic_utils import serialize_keras_object from tensorflow.python.keras.utils.io_utils import HDF5Matrix from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model +from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions from tensorflow.python.keras.utils.np_utils import normalize from tensorflow.python.keras.utils.np_utils import to_categorical from tensorflow.python.keras.utils.vis_utils import plot_model diff --git a/tensorflow/contrib/kernel_methods/python/kernel_estimators.py b/tensorflow/contrib/kernel_methods/python/kernel_estimators.py index de7530231db..1626e55b9b3 100644 --- a/tensorflow/contrib/kernel_methods/python/kernel_estimators.py +++ b/tensorflow/contrib/kernel_methods/python/kernel_estimators.py @@ -90,7 +90,7 @@ def _update_features_and_columns(features, feature_columns, mapped_column_name = column_name + "_MAPPED" # Construct new feature columns based on provided kernel_mappers. column_kernel_mappers = kernel_mappers_dict[feature_column] - new_dim = sum([mapper.output_dim for mapper in column_kernel_mappers]) + new_dim = sum(mapper.output_dim for mapper in column_kernel_mappers) mapped_columns.add( layers.feature_column.real_valued_column(mapped_column_name, new_dim)) diff --git a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py index 75806dbbeb1..c392adbb1d9 100644 --- a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py +++ b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py @@ -34,15 +34,12 @@ class KinesisDataset(dataset_ops.DatasetSource): For example, we can construct and use the KinesisDataset as follows: ```python + tf.enable_eager_execution() + dataset = tf.contrib.kinesis.KinesisDataset( "kinesis_stream_name", read_indefinitely=False) - next = dataset.make_one_shot_iterator().get_next() - with tf.Session() as sess: - while True: - try: - print(sess.run(nxt)) - except tf.errors.OutOfRangeError: - break + for element in dataset: + print(element) ``` Since Kinesis is a data streaming service, data may not be available diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD index e6596bfdfb9..9ca6f8df5db 100644 --- a/tensorflow/contrib/layers/BUILD +++ b/tensorflow/contrib/layers/BUILD @@ -78,6 +78,11 @@ tf_custom_op_py_library( ":sparse_feature_cross_op_op_lib", ], srcs_version = "PY2AND3", + visibility = [ + "//learning/brain:__subpackages__", + "//tensorflow:__subpackages__", + "//video/youtube/personalization:__subpackages__", + ], deps = [ ":sparse_feature_cross_op", "//tensorflow/contrib/framework:framework_py", @@ -253,7 +258,7 @@ py_test( "//tensorflow/python:training", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", - 
"//tensorflow/python/feature_column", + "//tensorflow/python/feature_column:feature_column_py", "//third_party/py/numpy", ], ) @@ -277,7 +282,7 @@ py_test( "//tensorflow/python:sparse_tensor", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", - "//tensorflow/python/feature_column", + "//tensorflow/python/feature_column:feature_column_py", "//third_party/py/numpy", ], ) diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py index 124515e5a64..295c721fced 100644 --- a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py +++ b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import itertools import math +import sys import numpy as np @@ -36,6 +37,7 @@ from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import partitioned_variables +from tensorflow.python.ops import variable_scope from tensorflow.python.platform import test from tensorflow.python.util import compat @@ -48,11 +50,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase): assert num_shards > 0 assert num_shards <= vocab_size - embedding_weights = partitioned_variables.create_partitioned_variables( + initializer = init_ops.truncated_normal_initializer( + mean=0.0, stddev=1.0 / math.sqrt(vocab_size), dtype=dtypes.float32) + embedding_weights = list(variable_scope.get_variable( + "embedding_weights", shape=[vocab_size, embed_dim], - slicing=[num_shards, 1], - initializer=init_ops.truncated_normal_initializer( - mean=0.0, stddev=1.0 / math.sqrt(vocab_size), dtype=dtypes.float32)) + partitioner=partitioned_variables.fixed_size_partitioner(num_shards), + initializer=initializer)) for w in embedding_weights: w.initializer.run() embedding_weights = [w.eval() for w in embedding_weights] @@ -256,6 +260,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase): embedding_weights, sparse_ids, sparse_weights) +# pylint: disable=invalid-name +def local_variable_scope(): + """Create a variable scope named like the caller function.""" + return variable_scope.variable_scope(sys._getframe(1).f_code.co_name) +# pylint: enable=invalid-name + + class ScatteredEmbeddingLookupTest(test.TestCase): def setUp(self): @@ -266,17 +277,18 @@ class ScatteredEmbeddingLookupTest(test.TestCase): assert num_shards > 0 assert num_shards <= size - embedding_weights = partitioned_variables.create_partitioned_variables( + embedding_weights = list(variable_scope.get_variable( + "embedding_weights", shape=[size], - slicing=[num_shards], + partitioner=partitioned_variables.fixed_size_partitioner(num_shards), initializer=init_ops.truncated_normal_initializer( - mean=0.0, stddev=1.0, dtype=dtypes.float32)) + mean=0.0, stddev=1.0, dtype=dtypes.float32))) for w in embedding_weights: w.initializer.run() return embedding_weights def test_scattered_embedding_consistency(self): - with self.cached_session(): + with self.cached_session(), local_variable_scope(): embedding_weights = self._random_weights() values = constant_op.constant(["foo", "foo"]) @@ -288,7 +300,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase): embedding_lookup_result[1]) def test_scattered_embedding_multiple_partition(self): - with self.cached_session(): + with self.cached_session(), local_variable_scope(): embedding_weights = self._random_weights(num_shards=7) values = constant_op.constant([4, 4, 5]) @@ -304,7 
+316,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase): self.assertGreater(embedding_diff, 0) def test_scattered_embedding_coverage(self): - with self.cached_session(): + with self.cached_session(), local_variable_scope(): size = 8 embedding_weights = self._random_weights(size=size, num_shards=3) values = constant_op.constant(["foo"]) @@ -316,7 +328,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase): self.assertEqual(len(np.unique(embedding_lookup_result[0])), size) def test_scattered_embedding_multi_dimension(self): - with self.cached_session(): + with self.cached_session(), local_variable_scope(): embedding_weights = self._random_weights() values = constant_op.constant([["foo", "bar", "bar"], ["bar", "bar", "foo"]]) @@ -329,7 +341,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase): embedding_lookup_result[1][2]) def test_scattered_embedding_lookup_sparse(self): - with self.cached_session(): + with self.cached_session(), local_variable_scope(): embedding_weights = self._random_weights(num_shards=3) sparse_tensor = sparse_tensor_lib.SparseTensor( values=["foo", "bar", "foo", "bar"], @@ -358,7 +370,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase): embeds = np.random.randn(n_embed, d_embed) idx = np.random.randint(0, n_embed, idx_shape) - with self.cached_session(): + with self.cached_session(), local_variable_scope(): embedded_np = embeds[idx] embedded_tf = embedding_ops.embedding_lookup_unique(embeds, idx).eval() @@ -370,7 +382,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase): idx = np.random.randint(0, 5, 10) idx2d = np.random.randint(0, 5, (10, 2)) - with self.cached_session(): + with self.cached_session(), local_variable_scope(): embedded_np = embeds[idx] embedded_np2d = embeds[idx2d] embedded_tf = embedding_ops.embedding_lookup_unique(embeds, idx).eval() @@ -398,17 +410,18 @@ class SampledScatteredEmbeddingLookupTest(test.TestCase): assert num_shards > 0 assert num_shards <= size - embedding_weights = partitioned_variables.create_partitioned_variables( + embedding_weights = list(variable_scope.get_variable( + "embedding_weights", shape=[size], - slicing=[num_shards], + partitioner=partitioned_variables.fixed_size_partitioner(num_shards), initializer=init_ops.truncated_normal_initializer( - mean=0.0, stddev=1.0, dtype=dtypes.float32)) + mean=0.0, stddev=1.0, dtype=dtypes.float32))) for w in embedding_weights: w.initializer.run() return embedding_weights def test_hashed_embedding_consistency(self): - with self.cached_session(): + with self.cached_session(), local_variable_scope(): embedding_weights = self._random_weights() values = constant_op.constant(["foo", "foo"]) # The first three sampled_candidates are equal, so the first three @@ -429,7 +442,7 @@ class SampledScatteredEmbeddingLookupTest(test.TestCase): embedding_lookup_result[1][3]) def test_hashed_embedding_multi_dimension(self): - with self.cached_session(): + with self.cached_session(), local_variable_scope(): embedding_weights = self._random_weights() values = constant_op.constant([["foo", "bar", "bar"], ["bar", "bar", "foo"]]) @@ -691,7 +704,6 @@ class EmbeddingLookupSparseWithDistributedAggregationTest(test.TestCase): index += num_val return grouped_vals - @test_util.enable_c_shapes def testEmbeddingLookupSparse(self): vocab_size = 13 batch_size = 10 diff --git a/tensorflow/contrib/layers/python/layers/encoders.py b/tensorflow/contrib/layers/python/layers/encoders.py index f42112206d0..3671633c8d7 100644 --- a/tensorflow/contrib/layers/python/layers/encoders.py +++ 
b/tensorflow/contrib/layers/python/layers/encoders.py @@ -84,8 +84,7 @@ def bow_encoder(ids, if isinstance(ids, sparse_tensor.SparseTensor): raise TypeError('ids are expected to be dense Tensor, got: %s', ids) return math_ops.reduce_mean( - embedding_ops.embedding_lookup(embeddings, ids), - reduction_indices=1) + embedding_ops.embedding_lookup(embeddings, ids), axis=1) def embed_sequence(ids, diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py index 222404b19db..00d819ed0e9 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column.py +++ b/tensorflow/contrib/layers/python/layers/feature_column.py @@ -1015,8 +1015,7 @@ class _OneHotColumn( dense_id_tensor, depth=self.length, on_value=1.0, off_value=0.0) # Reduce to get a multi-hot per example. - return math_ops.reduce_sum( - one_hot_id_tensor, reduction_indices=[output_rank - 1]) + return math_ops.reduce_sum(one_hot_id_tensor, axis=[output_rank - 1]) @property def _variable_shape(self): diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py index 6fb4b9ff353..7e6eafaa0d6 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py @@ -27,7 +27,7 @@ from tensorflow.contrib.layers.python.layers import feature_column from tensorflow.contrib.layers.python.layers import feature_column_ops from tensorflow.core.example import example_pb2 from tensorflow.core.example import feature_pb2 -from tensorflow.python.feature_column import feature_column as fc_core +from tensorflow.python.feature_column import feature_column_lib as fc_core from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py index d90d6ecf7f6..cab8da808b6 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_test.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py @@ -27,7 +27,7 @@ import numpy as np from tensorflow.contrib.layers.python.layers import feature_column as fc from tensorflow.contrib.layers.python.layers import feature_column_ops -from tensorflow.python.feature_column import feature_column as fc_core +from tensorflow.python.feature_column import feature_column_lib as fc_core from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index ac9561c7693..403b522ce45 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -35,6 +35,7 @@ from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras.engine import input_spec from tensorflow.python.layers import base from tensorflow.python.layers import convolutional as convolutional_layers from tensorflow.python.layers import core as core_layers @@ -1958,7 +1959,7 @@ class GDN(base.Layer): self._reparam_offset = reparam_offset self.data_format = 
data_format self._channel_axis() # trigger ValueError early - self.input_spec = base.InputSpec(min_ndim=3, max_ndim=5) + self.input_spec = input_spec.InputSpec(min_ndim=3, max_ndim=5) def _channel_axis(self): try: @@ -2015,7 +2016,7 @@ class GDN(base.Layer): raise ValueError('The channel dimension of the inputs to `GDN` ' 'must be defined.') self._input_rank = input_shape.ndims - self.input_spec = base.InputSpec( + self.input_spec = input_spec.InputSpec( ndim=input_shape.ndims, axes={ channel_axis: num_channels }) diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index 8ead6336a08..0a4d2c6d4cb 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -3811,7 +3811,7 @@ class UnitNormTests(test.TestCase): image = random_ops.random_uniform((height, width, 3)) output = _layers.unit_norm(image, dim=dim, epsilon=1e-6) norms = math_ops.sqrt( - math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim)) + math_ops.reduce_sum(math_ops.square(output), axis=dim)) shape = [height, width, 3] del shape[dim] @@ -3847,7 +3847,7 @@ class UnitNormTests(test.TestCase): image = array_ops.placeholder(dtypes.float32, (None, None, 3)) output = _layers.unit_norm(image, dim=dim, epsilon=1e-6) norms = math_ops.sqrt( - math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim)) + math_ops.reduce_sum(math_ops.square(output), axis=dim)) with self.cached_session(): actual = norms.eval({image: placeholder_value}) diff --git a/tensorflow/contrib/layers/python/layers/regularizers_test.py b/tensorflow/contrib/layers/python/layers/regularizers_test.py index 51faba30c74..5cb00b76847 100644 --- a/tensorflow/contrib/layers/python/layers/regularizers_test.py +++ b/tensorflow/contrib/layers/python/layers/regularizers_test.py @@ -141,7 +141,7 @@ class RegularizerTest(test.TestCase): dummy_regularizer = lambda x: math_ops.reduce_sum(2 * x) array_weights_list = [[1.5], [2, 3, 4.2], [10, 42, 666.6]] tensor_weights_list = [constant_op.constant(x) for x in array_weights_list] - expected = sum([2 * x for l in array_weights_list for x in l]) + expected = sum(2 * x for l in array_weights_list for x in l) with self.cached_session(): result = regularizers.apply_regularization(dummy_regularizer, tensor_weights_list) diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index 61185f65a9b..14065fcee51 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -24,6 +24,11 @@ py_library( exclude = ["python/learn/**/*_test.py"], ), srcs_version = "PY2AND3", + visibility = [ + "//learning/brain:__subpackages__", + "//tensorflow:__subpackages__", + "//video/youtube/personalization:__subpackages__", + ], # This library should not depend on sklearn, even though some of the code # refers to it. (The code handles the presence of sklearn conditionally.) 
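Editor's note: the embedding_ops_test.py hunks above replace `partitioned_variables.create_partitioned_variables(..., slicing=[num_shards, 1], ...)` with a single `variable_scope.get_variable` call that takes a `fixed_size_partitioner`. A minimal sketch of that new pattern, assuming a TF 1.x graph-mode build; the names and sizes here are illustrative and not taken from the diff:

```python
# Minimal sketch of the pattern used in embedding_ops_test.py above: one
# get_variable call with a fixed_size_partitioner replaces
# create_partitioned_variables(..., slicing=[num_shards, 1], ...).
# (Assumes a TF 1.x graph-mode build; names and sizes are illustrative.)
import math
import tensorflow as tf

vocab_size, embed_dim, num_shards = 100, 8, 4

with tf.variable_scope("sketch"):
  embedding_weights = list(tf.get_variable(
      "embedding_weights",
      shape=[vocab_size, embed_dim],
      partitioner=tf.fixed_size_partitioner(num_shards),
      initializer=tf.truncated_normal_initializer(
          mean=0.0, stddev=1.0 / math.sqrt(vocab_size))))

# Iterating the returned PartitionedVariable yields one variable per shard,
# each of shape [vocab_size // num_shards, embed_dim].
print([w.shape.as_list() for w in embedding_weights])
```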
deps = [ @@ -269,6 +274,7 @@ py_test( name = "estimator_test", size = "medium", srcs = ["python/learn/estimators/estimator_test.py"], + shard_count = 2, srcs_version = "PY2AND3", tags = [ "manual", diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py index eabebb7e881..10fbd60ba2d 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py @@ -28,7 +28,6 @@ import six from tensorflow.contrib import layers from tensorflow.contrib.framework import deprecated from tensorflow.contrib.framework import deprecated_arg_values -from tensorflow.python.training import training_util from tensorflow.contrib.layers.python.layers import feature_column from tensorflow.contrib.layers.python.layers import optimizers from tensorflow.contrib.learn.python.learn import metric_spec @@ -38,11 +37,12 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib from tensorflow.contrib.learn.python.learn.estimators import model_fn from tensorflow.contrib.learn.python.learn.estimators import prediction_key from tensorflow.contrib.learn.python.learn.utils import export -from tensorflow.python.feature_column import feature_column as fc_core +from tensorflow.python.feature_column import feature_column_lib as fc_core from tensorflow.python.ops import nn from tensorflow.python.ops import partitioned_variables from tensorflow.python.ops import variable_scope from tensorflow.python.summary import summary +from tensorflow.python.training import training_util # The default learning rate of 0.05 is a historical artifact of the initial # implementation, but seems a reasonable choice. @@ -150,10 +150,10 @@ def _dnn_model_fn(features, labels, mode, params, config=None): "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=input_layer_partitioner) as input_layer_scope: - if all([ + if all( isinstance(fc, feature_column._FeatureColumn) # pylint: disable=protected-access for fc in feature_columns - ]): + ): net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=feature_columns, diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py index 3d85533d92d..2ade6b7b6ce 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py @@ -38,7 +38,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib from tensorflow.contrib.learn.python.learn.estimators import model_fn from tensorflow.contrib.learn.python.learn.estimators import prediction_key from tensorflow.contrib.learn.python.learn.utils import export -from tensorflow.python.feature_column import feature_column as fc_core +from tensorflow.python.feature_column import feature_column_lib as fc_core from tensorflow.python.framework import ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import nn @@ -236,10 +236,10 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None): "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=input_layer_partitioner) as dnn_input_scope: - if all([ + if all( isinstance(fc, feature_column_lib._FeatureColumn) # pylint: disable=protected-access for fc in dnn_feature_columns - ]): + ): net = layers.input_from_feature_columns( 
columns_to_tensors=features, feature_columns=dnn_feature_columns, @@ -292,8 +292,8 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None): linear_parent_scope, values=tuple(six.itervalues(features)), partitioner=linear_partitioner) as scope: - if all([isinstance(fc, feature_column_lib._FeatureColumn) # pylint: disable=protected-access - for fc in linear_feature_columns]): + if all(isinstance(fc, feature_column_lib._FeatureColumn) # pylint: disable=protected-access + for fc in linear_feature_columns): if joint_linear_weights: linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns( columns_to_tensors=features, diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py index 4e65c180d8b..d46a873bfaa 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py @@ -36,7 +36,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config from tensorflow.contrib.learn.python.learn.estimators import test_data from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec from tensorflow.contrib.metrics.python.ops import metric_ops -from tensorflow.python.feature_column import feature_column as fc_core +from tensorflow.python.feature_column import feature_column_lib as fc_core from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py index 2bd57597c2e..ee25cebd484 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py @@ -38,7 +38,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config from tensorflow.contrib.learn.python.learn.estimators import test_data from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec from tensorflow.contrib.metrics.python.ops import metric_ops -from tensorflow.python.feature_column import feature_column as fc_core +from tensorflow.python.feature_column import feature_column_lib as fc_core from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py index 1d8a59281a4..28c4964527b 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py @@ -668,7 +668,7 @@ class DynamicRNNEstimatorLearningTest(test.TestCase): sequences = centers + noise inputs = array_ops.expand_dims(sequences, 2) - labels = math_ops.reduce_mean(sequences, reduction_indices=[1]) + labels = math_ops.reduce_mean(sequences, axis=[1]) return {'inputs': inputs}, labels return input_fn @@ -722,8 +722,8 @@ class DynamicRNNEstimatorLearningTest(test.TestCase): inputs = array_ops.expand_dims(math_ops.to_float(random_sequence), 2) labels = math_ops.to_int32( array_ops.squeeze( - math_ops.reduce_sum( - inputs, reduction_indices=[1]) > (sequence_length / 2.0))) + math_ops.reduce_sum(inputs, axis=[1]) > ( + sequence_length 
/ 2.0))) return {'inputs': inputs}, labels return input_fn diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 8bc869db895..9132b2209bc 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -1066,11 +1066,11 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, chief_hooks = [] if (self._config.save_checkpoints_secs or self._config.save_checkpoints_steps): - saver_hook_exists = any([ + saver_hook_exists = any( isinstance(h, basic_session_run_hooks.CheckpointSaverHook) for h in (all_hooks + model_fn_ops.training_hooks + chief_hooks + model_fn_ops.training_chief_hooks) - ]) + ) if not saver_hook_exists: chief_hooks = [ basic_session_run_hooks.CheckpointSaverHook( @@ -1493,7 +1493,7 @@ class Estimator(BaseEstimator): # pylint: disable=protected-access class SKCompat(sklearn.BaseEstimator): """Scikit learn wrapper for TensorFlow Learn Estimator. - + THIS CLASS IS DEPRECATED. See [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md) for general migration instructions. diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py index e100bc7a1e7..9ee8d8004bf 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py @@ -37,7 +37,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib from tensorflow.contrib.learn.python.learn.estimators import prediction_key from tensorflow.contrib.learn.python.learn.utils import export from tensorflow.contrib.linear_optimizer.python import sdca_optimizer -from tensorflow.python.feature_column import feature_column as fc_core +from tensorflow.python.feature_column import feature_column_lib as fc_core from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor @@ -155,8 +155,8 @@ def _linear_model_fn(features, labels, mode, params, config=None): parent_scope, values=tuple(six.itervalues(features)), partitioner=partitioner) as scope: - if all([isinstance(fc, feature_column._FeatureColumn) # pylint: disable=protected-access - for fc in feature_columns]): + if all(isinstance(fc, feature_column._FeatureColumn) # pylint: disable=protected-access + for fc in feature_columns): if joint_weights: layer_fn = layers.joint_weighted_sum_from_feature_columns else: diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py index 597ca4e86db..dfc76bfde6c 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py @@ -37,7 +37,7 @@ from tensorflow.contrib.learn.python.learn.estimators import test_data from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec from tensorflow.contrib.linear_optimizer.python import sdca_optimizer as sdca_optimizer_lib from tensorflow.contrib.metrics.python.ops import metric_ops -from tensorflow.python.feature_column import feature_column as fc_core +from tensorflow.python.feature_column import feature_column_lib as fc_core from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor @@ 
-1745,7 +1745,7 @@ class LinearRegressorTest(test.TestCase): 'place_holder': constant_op.constant([[0.0]] * num_examples), }, constant_op.constant( - [[1 if i % 4 is 0 else 0] for i in range(num_examples)]) + [[1 if i % 4 == 0 else 0] for i in range(num_examples)]) place_holder = feature_column_lib.real_valued_column('place_holder') sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( diff --git a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py index 29552d24f1e..59a67636ae2 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py @@ -27,7 +27,7 @@ from tensorflow.python.estimator.inputs.numpy_io import numpy_input_fn as core_n from tensorflow.python.util.deprecation import deprecated -@deprecated(None, 'Use tf.estimator.inputs.numpy_input_fn.') +@deprecated(None, 'Use tf.compat.v1.estimator.inputs.numpy_input_fn.') def numpy_input_fn(x, y=None, batch_size=128, diff --git a/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py b/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py index b4ef055f5ae..e9df7258a35 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py @@ -53,7 +53,7 @@ PANDAS_DTYPES = { } -@deprecated(None, 'Please use tf.estimator.inputs.pandas_input_fn') +@deprecated(None, 'Please use tf.compat.v1.estimator.inputs.pandas_input_fn') def pandas_input_fn(x, y=None, batch_size=128, diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py index 64766718823..7a5354222f1 100644 --- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py +++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py @@ -524,7 +524,7 @@ class SDCALinearRegressorTest(test.TestCase): # LinearClassifier requires at least one column. 
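Editor's note: the linear_test.py hunk above (and the matching one in sdca_estimator_test.py just below) replaces `i % 4 is 0` with `i % 4 == 0`. `is` compares object identity rather than numeric value, only appears to work because CPython caches small integers, and newer Python versions emit a SyntaxWarning for `is` with a literal. A tiny TensorFlow-free illustration of the difference:

```python
# Why `i % 4 is 0` was changed to `i % 4 == 0`: `is` checks identity, and the
# small-integer cache that makes it look correct is an implementation detail.
a = int("257")          # built at runtime, outside CPython's small-int cache
b = int("257")
print(a == b)           # True: numeric equality, the check the test wants
print(a is b)           # False here (CPython): two distinct int objects

labels = [[1 if i % 4 == 0 else 0] for i in range(8)]
print(labels)           # [[1], [0], [0], [0], [1], [0], [0], [0]]
```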
'place_holder': constant_op.constant([[0.0]] * num_examples), - }, constant_op.constant([[1 if i % 4 is 0 else 0] + }, constant_op.constant([[1 if i % 4 == 0 else 0] for i in range(num_examples)]) with self._single_threaded_test_session(): diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py index 5e99ef46051..9b2c2dd87cc 100644 --- a/tensorflow/contrib/lookup/lookup_ops_test.py +++ b/tensorflow/contrib/lookup/lookup_ops_test.py @@ -25,6 +25,7 @@ import six from tensorflow.contrib import lookup from tensorflow.python.client import session from tensorflow.python.data.experimental.ops import counter +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -2737,7 +2738,7 @@ class MutableHashTableBenchmark(test.Benchmark): def benchmark_many_repeated_scalar_insert_scalar(self): table = self._create_table() - c = counter.Counter().make_one_shot_iterator().get_next() + c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next() value = variables.Variable(1.0) insert = table.insert(c, value) size = table.size() @@ -2758,7 +2759,7 @@ class MutableHashTableBenchmark(test.Benchmark): def benchmark_many_repeated_batch_32_insert_scalar(self): table = self._create_table() - c = counter.Counter().make_one_shot_iterator().get_next() + c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next() value = variables.Variable([1.0] * 32) insert = table.insert(32 * c + list(range(32)), value) size = table.size() diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py index 619294b5182..709a042bbce 100644 --- a/tensorflow/contrib/losses/python/losses/loss_ops.py +++ b/tensorflow/contrib/losses/python/losses/loss_ops.py @@ -22,7 +22,6 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.framework.python.ops import add_arg_scope -from tensorflow.python.compat import compat from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -60,41 +59,12 @@ def _scale_losses(losses, weights): """ # First, compute the sum of the losses over all elements: start_index = max(0, weights.get_shape().ndims) - reduction_indices = list(range(start_index, losses.get_shape().ndims)) - reduced_losses = math_ops.reduce_sum( - losses, reduction_indices=reduction_indices) + axis = list(range(start_index, losses.get_shape().ndims)) + reduced_losses = math_ops.reduce_sum(losses, axis=axis) reduced_losses = math_ops.multiply(reduced_losses, weights) return math_ops.reduce_sum(reduced_losses) -def _safe_div(numerator, denominator, name="value"): - """Computes a safe divide which returns 0 if the denominator is zero. - - Note that the function contains an additional conditional check that is - necessary for avoiding situations where the loss is zero causing NaNs to - creep into the gradient computation. - - Args: - numerator: An arbitrary `Tensor`. - denominator: A `Tensor` whose shape matches `numerator` and whose values are - assumed to be non-negative. - name: An optional name for the returned op. - - Returns: - The element-wise value of the numerator divided by the denominator. 
- """ - if compat.forward_compatible(2018, 11, 1): - return math_ops.div_no_nan(numerator, denominator, name=name) - return array_ops.where( - math_ops.greater(denominator, 0), - math_ops.div(numerator, - array_ops.where( - math_ops.equal(denominator, 0), - array_ops.ones_like(denominator), denominator)), - array_ops.zeros_like(numerator), - name=name) - - def _safe_mean(losses, num_present): """Computes a safe mean of the losses. @@ -107,7 +77,7 @@ def _safe_mean(losses, num_present): then zero is returned. """ total_loss = math_ops.reduce_sum(losses) - return _safe_div(total_loss, num_present, name="value") + return math_ops.div_no_nan(total_loss, num_present, name="value") @deprecated("2016-12-30", "Use tf.losses.compute_weighted_loss instead.") @@ -187,10 +157,9 @@ def _num_present(losses, weights, per_batch=False): # First, count the number of nonzero weights: if weights.get_shape().ndims >= 1: - reduction_indices = list(range(1, weights.get_shape().ndims)) + axis = list(range(1, weights.get_shape().ndims)) num_nonzero_per_batch = math_ops.reduce_sum( - math_ops.to_float(math_ops.not_equal(weights, 0)), - reduction_indices=reduction_indices) + math_ops.to_float(math_ops.not_equal(weights, 0)), axis=axis) # Next, determine the number of elements that weights would broadcast to: broadcast_dims = array_ops.slice( @@ -606,20 +575,20 @@ def mean_pairwise_squared_error(predictions, if weights.get_shape().ndims is None: raise ValueError("weights.get_shape().ndims cannot be None") - reduction_indices = list(range(1, diffs.get_shape().ndims)) + axis = list(range(1, diffs.get_shape().ndims)) sum_squares_diff_per_batch = math_ops.reduce_sum( - math_ops.square(diffs), reduction_indices=reduction_indices) + math_ops.square(diffs), axis=axis) num_present_per_batch = _num_present(diffs, weights, per_batch=True) - term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, - num_present_per_batch, - name="value") + term1 = 2.0 * math_ops.div_no_nan( + sum_squares_diff_per_batch, num_present_per_batch, name="value") - sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices) - term2 = 2.0 * _safe_div(math_ops.square(sum_diff), - math_ops.square(num_present_per_batch), - name="value") + sum_diff = math_ops.reduce_sum(diffs, axis=axis) + term2 = 2.0 * math_ops.div_no_nan( + math_ops.square(sum_diff), + math_ops.square(num_present_per_batch), + name="value") loss = _scale_losses(term1 - term2, weights) @@ -674,7 +643,7 @@ def cosine_distance(predictions, radial_diffs = math_ops.multiply(predictions, labels) losses = 1 - math_ops.reduce_sum( - radial_diffs, reduction_indices=[ + radial_diffs, axis=[ axis, ]) return compute_weighted_loss(losses, weights, scope=scope) diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index 0a07588f07f..b396c527673 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -34,7 +34,7 @@ NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\. # 1.10 branch does not work. `make distclean` fails and blocks the build # process. For now we're hardcoding to the version which is used by # TensorFlow 1.9. 
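Editor's note: the loss_ops.py hunks above make two mechanical substitutions: the deprecated `reduction_indices` argument becomes `axis`, and the hand-rolled `_safe_div` helper (a `tf.where`-based guard against a zero denominator) is dropped in favor of `math_ops.div_no_nan`, which returns 0 wherever the denominator is 0. A small sketch of the equivalence, assuming a TF 1.x build that already exposes `tf.div_no_nan` (the rewritten code itself requires `math_ops.div_no_nan`, so that assumption is consistent); the tensors are illustrative:

```python
# Sketch: div_no_nan vs. the removed tf.where-based safe divide.
# (Assumes a TF 1.x-era build exposing tf.div_no_nan; values are illustrative.)
import tensorflow as tf

losses = tf.constant([[1.0, 3.0], [0.0, 0.0]])
num_present = tf.constant([2.0, 0.0])        # second row has nothing present

per_batch = tf.reduce_sum(losses, axis=1)    # was reduction_indices=[1]

# Old pattern, as removed above: guard the denominator with tf.where.
safe_old = tf.where(
    tf.greater(num_present, 0),
    tf.div(per_batch,
           tf.where(tf.equal(num_present, 0),
                    tf.ones_like(num_present), num_present)),
    tf.zeros_like(per_batch))

# New pattern: div_no_nan returns 0 wherever the denominator is 0.
safe_new = tf.div_no_nan(per_batch, num_present)

with tf.Session() as sess:
  print(sess.run([safe_old, safe_new]))      # both evaluate to [2.0, 0.0]
```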
-PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz" +PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz" # TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' once # the archive has been propagated in mirror.bazel.build. RE2_URL="$(grep -o 'https://github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index e779eff6890..655c7eefcb9 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -157,6 +157,7 @@ tensorflow/core/kernels/mirror_pad_op_cpu_impl_2.cc tensorflow/core/kernels/mirror_pad_op_cpu_impl_3.cc tensorflow/core/kernels/mirror_pad_op_cpu_impl_4.cc tensorflow/core/kernels/mirror_pad_op_cpu_impl_5.cc +tensorflow/core/kernels/multinomial_op.cc tensorflow/core/kernels/no_op.cc tensorflow/core/kernels/non_max_suppression_op.cc tensorflow/core/kernels/one_hot_op.cc @@ -252,6 +253,7 @@ tensorflow/core/kernels/split_op.cc tensorflow/core/kernels/split_v_op.cc tensorflow/core/kernels/stack.cc tensorflow/core/kernels/stack_ops.cc +tensorflow/core/kernels/stateless_random_ops.cc tensorflow/core/kernels/strided_slice_op.cc tensorflow/core/kernels/strided_slice_op_inst_0.cc tensorflow/core/kernels/strided_slice_op_inst_1.cc diff --git a/tensorflow/contrib/metrics/python/metrics/classification.py b/tensorflow/contrib/metrics/python/metrics/classification.py index ac123608650..062deb74b16 100644 --- a/tensorflow/contrib/metrics/python/metrics/classification.py +++ b/tensorflow/contrib/metrics/python/metrics/classification.py @@ -175,7 +175,7 @@ def f1_score(labels, predictions, weights=None, num_thresholds=200, return best_f1 best_f1 = distribution_strategy_context.get_replica_context().merge_call( - f1_across_replicas, values) + f1_across_replicas, args=(values,)) update_op = compute_best_f1_score(tp=update_ops['tp'], fp=update_ops['fp'], fn=update_ops['fn'], name='update') diff --git a/tensorflow/contrib/metrics/python/metrics/classification_test.py b/tensorflow/contrib/metrics/python/metrics/classification_test.py index d6a670f97b3..e789d2cb9df 100644 --- a/tensorflow/contrib/metrics/python/metrics/classification_test.py +++ b/tensorflow/contrib/metrics/python/metrics/classification_test.py @@ -291,12 +291,11 @@ class F1ScoreTest(test.TestCase): labels = labels.astype(np.float32) predictions = predictions.astype(np.float32) - tf_predictions, tf_labels = (dataset_ops.Dataset - .from_tensor_slices((predictions, labels)) - .repeat() - .batch(batch_size) - .make_one_shot_iterator() - .get_next()) + tf_predictions, tf_labels = dataset_ops.make_one_shot_iterator( + dataset_ops.Dataset + .from_tensor_slices((predictions, labels)) + .repeat() + .batch(batch_size)).get_next() f1, f1_op = classification.f1_score(tf_labels, tf_predictions, num_thresholds=3) diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index d6932f6e4b6..7b432f8bd20 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -24,7 +24,6 @@ from __future__ import print_function import collections as collections_lib -from tensorflow.python.compat import compat from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from 
tensorflow.python.framework import ops @@ -46,32 +45,6 @@ from tensorflow.python.util.deprecation import deprecated _EPSILON = 1e-7 -def _safe_div(numerator, denominator): - """Computes a safe divide which returns 0 if the denominator is zero. - - Note that the function contains an additional conditional check that is - necessary for avoiding situations where the loss is zero causing NaNs to - creep into the gradient computation. - - Args: - numerator: An arbitrary `Tensor`. - denominator: A `Tensor` whose shape matches `numerator` and whose values are - assumed to be non-negative. - - Returns: - The element-wise value of the numerator divided by the denominator. - """ - if compat.forward_compatible(2018, 11, 1): - return math_ops.div_no_nan(numerator, denominator) - return array_ops.where( - math_ops.greater(denominator, 0), - math_ops.div(numerator, - array_ops.where( - math_ops.equal(denominator, 0), - array_ops.ones_like(denominator), denominator)), - array_ops.zeros_like(numerator)) - - @deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the ' 'order of the labels and predictions arguments has been switched.') def streaming_true_positives(predictions, @@ -3247,24 +3220,20 @@ def streaming_covariance(predictions, # We update the means by Delta=Error*BatchCount/(BatchCount+PrevCount) # batch_mean_prediction is E[x_B] in the update equation - batch_mean_prediction = _safe_div( - math_ops.reduce_sum(weighted_predictions), - batch_count) - delta_mean_prediction = _safe_div( - (batch_mean_prediction - mean_prediction) * batch_count, - update_count) + batch_mean_prediction = math_ops.div_no_nan( + math_ops.reduce_sum(weighted_predictions), batch_count) + delta_mean_prediction = math_ops.div_no_nan( + (batch_mean_prediction - mean_prediction) * batch_count, update_count) update_mean_prediction = state_ops.assign_add(mean_prediction, delta_mean_prediction) # prev_mean_prediction is E[x_A] in the update equation prev_mean_prediction = update_mean_prediction - delta_mean_prediction # batch_mean_label is E[y_B] in the update equation - batch_mean_label = _safe_div( - math_ops.reduce_sum(weighted_labels), - batch_count) - delta_mean_label = _safe_div( - (batch_mean_label - mean_label) * batch_count, - update_count) + batch_mean_label = math_ops.div_no_nan( + math_ops.reduce_sum(weighted_labels), batch_count) + delta_mean_label = math_ops.div_no_nan( + (batch_mean_label - mean_label) * batch_count, update_count) update_mean_label = state_ops.assign_add(mean_label, delta_mean_label) # prev_mean_label is E[y_A] in the update equation prev_mean_label = update_mean_label - delta_mean_label @@ -3447,7 +3416,7 @@ def streaming_mean_cosine_distance(predictions, predictions.get_shape().assert_is_compatible_with(labels.get_shape()) radial_diffs = math_ops.multiply(predictions, labels) radial_diffs = math_ops.reduce_sum( - radial_diffs, reduction_indices=[ + radial_diffs, axis=[ dim, ], keepdims=True) mean_distance, update_op = streaming_mean(radial_diffs, weights, None, None, @@ -3926,9 +3895,8 @@ def cohen_kappa(labels, po_sum = math_ops.reduce_sum(po) total = math_ops.reduce_sum(pe_row) pe_sum = math_ops.reduce_sum( - _safe_div( - math_ops.to_double(pe_row * pe_col), - math_ops.to_double(total))) + math_ops.div_no_nan( + math_ops.to_double(pe_row * pe_col), math_ops.to_double(total))) po_sum, pe_sum, total = (math_ops.to_double(po_sum), math_ops.to_double(pe_sum), math_ops.to_double(total)) diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py 
b/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py index 1b0383d24c0..c922d0cd11f 100644 --- a/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py +++ b/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py @@ -29,7 +29,7 @@ from tensorflow.python.platform import test def _GetExampleIter(inputs): dataset = dataset_ops.Dataset.from_tensor_slices(inputs) - return dataset.make_one_shot_iterator() + return dataset_ops.make_one_shot_iterator(dataset) class FixedLossScaleManagerTest(test.TestCase): diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py index 9009df0eefe..33f9a43e803 100644 --- a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py +++ b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py @@ -132,7 +132,7 @@ class LossScaleOptimizerTest(test.TestCase): x = variable_scope.get_variable("x", initializer=1., dtype=dtypes.float32) dataset = dataset_ops.Dataset.from_tensor_slices([np.nan, np.inf, 0.1]) - itr = dataset.make_one_shot_iterator() + itr = dataset_ops.make_one_shot_iterator(dataset) lr = 1 opt = gd.GradientDescentOptimizer(lr) @@ -182,7 +182,7 @@ class LossScaleOptimizerTest(test.TestCase): x = variable_scope.get_variable("x", initializer=1., dtype=dtypes.float32) dataset = dataset_ops.Dataset.from_tensor_slices([np.nan, np.inf, 0.1]) - itr = dataset.make_one_shot_iterator() + itr = dataset_ops.make_one_shot_iterator(dataset) lr = 1 init_loss_scale = 8 diff --git a/tensorflow/contrib/model_pruning/python/layers/core_layers.py b/tensorflow/contrib/model_pruning/python/layers/core_layers.py index f0ce6fe0396..1fa5c8cb485 100644 --- a/tensorflow/contrib/model_pruning/python/layers/core_layers.py +++ b/tensorflow/contrib/model_pruning/python/layers/core_layers.py @@ -21,6 +21,7 @@ from __future__ import print_function from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras.engine import input_spec from tensorflow.python.layers import base from tensorflow.python.layers import utils from tensorflow.python.ops import array_ops @@ -119,7 +120,7 @@ class _MaskedConv(base.Layer): self.bias_initializer = bias_initializer self.kernel_regularizer = kernel_regularizer self.bias_regularizer = bias_regularizer - self.input_spec = base.InputSpec(ndim=self.rank + 2) + self.input_spec = input_spec.InputSpec(ndim=self.rank + 2) def build(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) @@ -171,7 +172,7 @@ class _MaskedConv(base.Layer): dtype=self.dtype) else: self.bias = None - self.input_spec = base.InputSpec( + self.input_spec = input_spec.InputSpec( ndim=self.rank + 2, axes={channel_axis: input_dim}) self.built = True @@ -393,14 +394,14 @@ class MaskedFullyConnected(base.Layer): self.bias_initializer = bias_initializer self.kernel_regularizer = kernel_regularizer self.bias_regularizer = bias_regularizer - self.input_spec = base.InputSpec(min_ndim=2) + self.input_spec = input_spec.InputSpec(min_ndim=2) def build(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) if tensor_shape.dimension_value(input_shape[-1]) is None: raise ValueError('The last dimension of the inputs to `Dense` ' 'should be defined. 
Found `None`.') - self.input_spec = base.InputSpec( + self.input_spec = input_spec.InputSpec( min_ndim=2, axes={-1: tensor_shape.dimension_value(input_shape[-1])}) self.kernel = self.add_variable( diff --git a/tensorflow/contrib/opt/python/training/lars_optimizer.py b/tensorflow/contrib/opt/python/training/lars_optimizer.py index a8dafd9a4cb..205d6c39491 100644 --- a/tensorflow/contrib/opt/python/training/lars_optimizer.py +++ b/tensorflow/contrib/opt/python/training/lars_optimizer.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops @@ -162,3 +163,14 @@ class LARSOptimizer(optimizer.Optimizer): math_ops.cast(self._momentum_tensor, grad.dtype), use_locking=self._use_locking, use_nesterov=self._use_nesterov) + + def _prepare(self): + learning_rate = self._learning_rate + if callable(learning_rate): + learning_rate = learning_rate() + self._learning_rate_tensor = ops.convert_to_tensor(learning_rate, + name="learning_rate") + momentum = self._momentum + if callable(momentum): + momentum = momentum() + self._momentum_tensor = ops.convert_to_tensor(momentum, name="momentum") \ No newline at end of file diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py index 155ff5b3f4f..960826407b6 100644 --- a/tensorflow/contrib/opt/python/training/nadam_optimizer.py +++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops @@ -83,14 +84,14 @@ class NadamOptimizer(adam.AdamOptimizer): with ops.control_dependencies([m_t]): m_t = scatter_add(m, indices, m_scaled_g_values) # m_bar = (1 - beta1) * g_t + beta1 * m_t - m_bar = m_scaled_g_values + beta1_t * m_t + m_bar = m_scaled_g_values + beta1_t * array_ops.gather(m_t, indices) # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) v = self.get_slot(var, "v") v_scaled_g_values = (grad * grad) * (1 - beta2_t) v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking) with ops.control_dependencies([v_t]): v_t = scatter_add(v, indices, v_scaled_g_values) - v_sqrt = math_ops.sqrt(v_t) - var_update = state_ops.assign_sub( - var, lr * m_bar / (v_sqrt + epsilon_t), use_locking=self._use_locking) + v_t_slice = array_ops.gather(v_t, indices) + v_sqrt = math_ops.sqrt(v_t_slice) + var_update = scatter_add(var, indices, -lr * m_bar / (v_sqrt + epsilon_t)) return control_flow_ops.group(*[var_update, m_bar, v_t]) diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py index 85e05ce71ce..a4372f64874 100644 --- a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py +++ b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py @@ -52,14 +52,19 @@ def nadam_update_numpy(param, class NadamOptimizerTest(test.TestCase): def doTestSparse(self, use_resource=False): + # need to use a larger value of epsilon here so that + # np.sqrt(v_t) + epsilon doesn't get rounded to 0 when + # the dtype is half and np.sqrt(v_t) = 0, as is the case + # when 
the gradient is 0 + sparse_epsilon = 1e-7 for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: with self.cached_session(): # Initialize variables for numpy implementation. m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype) if use_resource: var0 = resource_variable_ops.ResourceVariable(var0_np) @@ -67,21 +72,21 @@ class NadamOptimizerTest(test.TestCase): else: var0 = variables.Variable(var0_np) var1 = variables.Variable(var1_np) - grads0_np_indices = np.array([0, 1], dtype=np.int32) + grads0_np_indices = np.array([0, 2], dtype=np.int32) grads0 = ops.IndexedSlices( - constant_op.constant(grads0_np), - constant_op.constant(grads0_np_indices), constant_op.constant([2])) - grads1_np_indices = np.array([0, 1], dtype=np.int32) + constant_op.constant(grads0_np[grads0_np_indices]), + constant_op.constant(grads0_np_indices), constant_op.constant([3])) + grads1_np_indices = np.array([0, 2], dtype=np.int32) grads1 = ops.IndexedSlices( - constant_op.constant(grads1_np), - constant_op.constant(grads1_np_indices), constant_op.constant([2])) - opt = nadam_optimizer.NadamOptimizer() + constant_op.constant(grads1_np[grads1_np_indices]), + constant_op.constant(grads1_np_indices), constant_op.constant([3])) + opt = nadam_optimizer.NadamOptimizer(epsilon=sparse_epsilon) update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 3.0, 4.0], var1.eval()) beta1_power, beta2_power = opt._get_beta_accumulators() @@ -91,8 +96,10 @@ class NadamOptimizerTest(test.TestCase): self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) update.run() - var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1) + var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0, + epsilon=sparse_epsilon) + var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1, + epsilon=sparse_epsilon) # Validate updated params self.assertAllCloseAccordingToType(var0_np, var0.eval()) diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD index 3ba3ee29ec7..6e401406308 100644 --- a/tensorflow/contrib/optimizer_v2/BUILD +++ b/tensorflow/contrib/optimizer_v2/BUILD @@ -48,7 +48,6 @@ py_library( srcs_version = "PY2AND3", deps = [ "//tensorflow/python:control_flow_ops", - "//tensorflow/python:distribute", "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:resource_variable_ops", @@ -56,6 +55,8 @@ py_library( "//tensorflow/python:training", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", + "//tensorflow/python/distribute:distribute_lib", + "//tensorflow/python/distribute:reduce_util", ], ) diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py 
b/tensorflow/contrib/optimizer_v2/optimizer_v2.py index 467dd86d8fd..73a556f0b29 100644 --- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py +++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py @@ -24,6 +24,8 @@ import abc import six +from tensorflow.python.distribute import distribute_lib +from tensorflow.python.distribute import reduce_util as ds_reduce_util from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.framework import dtypes @@ -34,7 +36,6 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables -from tensorflow.python.training import distribute as distribute_lib from tensorflow.python.training import distribution_strategy_context as distribute_ctx from tensorflow.python.training import optimizer as optimizer_v1 from tensorflow.python.training import slot_creator @@ -446,7 +447,7 @@ class _OptimizerV2State(object): if v is None: if colocate_with is None: colocate_with = self._non_slot_devices - with self._distribution.colocate_vars_with(colocate_with): + with self._distribution.extended.colocate_vars_with(colocate_with): # TODO(josh11b): Use get_variable() except for the legacy Adam use case. v = variable_scope.variable(initial_value, name=name, trainable=False) self._non_slot_dict[name] = v @@ -657,7 +658,6 @@ class OptimizerV2(optimizer_v1.Optimizer): var_list=None, gate_gradients=GATE_OP, aggregation_method=None, - colocate_gradients_with_ops=False, name=None, grad_loss=None, stop_gradients=None, @@ -680,8 +680,6 @@ class OptimizerV2(optimizer_v1.Optimizer): `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. - colocate_gradients_with_ops: If True, try colocating gradients with the - corresponding op. name: Optional name for the returned operation. grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. stop_gradients: Optional. A Tensor or list of tensors not to differentiate @@ -704,8 +702,8 @@ class OptimizerV2(optimizer_v1.Optimizer): Minimization (and gradient computation) is done with respect to the elements of `var_list` if not None, else with respect to any trainable variables created during the execution of the `loss` function. - `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and - `grad_loss` are ignored when eager execution is enabled. + `gate_gradients`, `aggregation_method`, and `grad_loss` are ignored when + eager execution is enabled. @end_compatibility """ grads_and_vars = self.compute_gradients( @@ -713,7 +711,6 @@ class OptimizerV2(optimizer_v1.Optimizer): var_list=var_list, gate_gradients=gate_gradients, aggregation_method=aggregation_method, - colocate_gradients_with_ops=colocate_gradients_with_ops, grad_loss=grad_loss, stop_gradients=stop_gradients, scale_loss_by_num_replicas=scale_loss_by_num_replicas) @@ -733,7 +730,6 @@ class OptimizerV2(optimizer_v1.Optimizer): var_list=None, gate_gradients=GATE_OP, aggregation_method=None, - colocate_gradients_with_ops=False, grad_loss=None, stop_gradients=None, scale_loss_by_num_replicas=None): @@ -756,8 +752,6 @@ class OptimizerV2(optimizer_v1.Optimizer): `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. 
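Editor's note on the nadam_optimizer.py fix a few hunks above: the sparse path now gathers `m_t` and `v_t` at the updated indices and applies the variable update with `scatter_add` instead of using the dense tensors, and the test comment explains why a larger epsilon is needed when `sqrt(v_t)` is 0 in half precision. For reference, a NumPy sketch of a standard dense Nadam step of the kind the test's `nadam_update_numpy` helper computes; hyperparameter defaults are assumed and this is an illustration, not code from the diff:

```python
# NumPy sketch of one Nadam step (Adam with Nesterov momentum); illustrative only.
import numpy as np

def nadam_step(param, g, t, m, v, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
  lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)   # bias-corrected step size
  m = beta1 * m + (1 - beta1) * g                       # first moment
  v = beta2 * v + (1 - beta2) * g * g                   # second moment
  m_bar = (1 - beta1) * g + beta1 * m                   # Nesterov look-ahead
  param = param - lr_t * m_bar / (np.sqrt(v) + eps)     # eps keeps this finite
  return param, m, v

# With g == 0 (as in the new sparse test rows), sqrt(v) == 0, and in float16 a
# tiny eps can round to 0; hence the larger sparse_epsilon in the test above.
p, m, v = nadam_step(np.array([1.0, 1.0, 2.0]), np.array([0.1, 0.0, 0.1]),
                     t=1, m=np.zeros(3), v=np.zeros(3))
print(p)
```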
- colocate_gradients_with_ops: If True, try colocating gradients with the - corresponding op. grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. stop_gradients: Optional. A Tensor or list of tensors not to differentiate through. @@ -776,8 +770,8 @@ class OptimizerV2(optimizer_v1.Optimizer): not callable. @compatibility(eager) - When eager execution is enabled, `gate_gradients`, `aggregation_method`, - and `colocate_gradients_with_ops` are ignored. + When eager execution is enabled, `gate_gradients`, and `aggregation_method` + are ignored. @end_compatibility """ # TODO(josh11b): Test that we handle weight decay in a reasonable way. @@ -832,7 +826,6 @@ class OptimizerV2(optimizer_v1.Optimizer): grad_ys=grad_loss, gate_gradients=(gate_gradients == optimizer_v1.Optimizer.GATE_OP), aggregation_method=aggregation_method, - colocate_gradients_with_ops=colocate_gradients_with_ops, stop_gradients=stop_gradients) if gate_gradients == optimizer_v1.Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) @@ -848,8 +841,7 @@ class OptimizerV2(optimizer_v1.Optimizer): """Scale loss for the number of replicas.""" if scale_loss_by_num_replicas is None: scale_loss_by_num_replicas = ( - distribute_lib.get_loss_reduction() == variable_scope - .VariableAggregation.MEAN) + distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN) if scale_loss_by_num_replicas: num_replicas = \ distribute_ctx.get_distribution_strategy().num_replicas_in_sync @@ -892,7 +884,8 @@ class OptimizerV2(optimizer_v1.Optimizer): raise ValueError("No gradients provided for any variable: %s." % ([str(v) for _, v in grads_and_vars],)) return distribute_ctx.get_replica_context().merge_call( - self._distributed_apply, filtered, global_step=global_step, name=name) + self._distributed_apply, args=(filtered,), + kwargs={"global_step": global_step, "name": name}) def _get_or_create_state(self, var_list=None): """Either looks up or creates `_OptimizerV2State`. @@ -927,8 +920,8 @@ class OptimizerV2(optimizer_v1.Optimizer): def _distributed_apply(self, distribution, grads_and_vars, global_step, name): """`apply_gradients` for use with a `DistributionStrategy`.""" - reduced_grads = distribution.batch_reduce( - variable_scope.VariableAggregation.SUM, grads_and_vars) + reduced_grads = distribution.extended.batch_reduce_to( + ds_reduce_util.ReduceOp.SUM, grads_and_vars) var_list = [v for _, v in grads_and_vars] grads_and_vars = zip(reduced_grads, var_list) @@ -944,7 +937,7 @@ class OptimizerV2(optimizer_v1.Optimizer): with ops.name_scope(name, self._name) as name: per_graph_state = self._get_or_create_state(var_list=unwrapped_var_list) # Include the current value of any dynamic hyper parameters in `state`. - non_slot_devices = distribution.non_slot_devices(var_list) + non_slot_devices = distribution.extended.non_slot_devices(var_list) state = per_graph_state._copy_with_dynamic_hyper( # pylint: disable=protected-access self._hyper, distribution, non_slot_devices) @@ -989,7 +982,8 @@ class OptimizerV2(optimizer_v1.Optimizer): # Use the processors to update the variables. 
update_ops = [] for grad, var in grads_and_vars: - update_ops.extend(distribution.update(var, update, grad, grouped=False)) + update_ops.extend(distribution.extended.update( + var, update, args=(grad,), group=False)) # Give the child class a chance to do something after applying # gradients @@ -1001,8 +995,8 @@ class OptimizerV2(optimizer_v1.Optimizer): update_ops = control_flow_ops.group(update_ops) with ops.control_dependencies([update_ops]): - finish_updates = distribution.update_non_slot( - non_slot_devices, finish, grouped=False) + finish_updates = distribution.extended.update_non_slot( + non_slot_devices, finish, group=False) # We said grouped=False, which means finish_updates is always a list. # It will be [None] when finish() returns None. if finish_updates == [None]: @@ -1017,8 +1011,8 @@ class OptimizerV2(optimizer_v1.Optimizer): def update_global_step(global_step, name): return global_step.assign_add(1, read_value=False, name=name) - apply_updates = distribution.update(global_step, update_global_step, - name) + apply_updates = distribution.extended.update( + global_step, update_global_step, args=(name,)) # Add the training op to the TRAIN_OP graph collection in graph mode. if not eager_execution: diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD index d50b52b8ff1..53a3bc63e1d 100644 --- a/tensorflow/contrib/predictor/BUILD +++ b/tensorflow/contrib/predictor/BUILD @@ -42,6 +42,7 @@ py_library( name = "saved_model_predictor", srcs = ["saved_model_predictor.py"], srcs_version = "PY2AND3", + visibility = ["//learning/brain/contrib/learn/tpu:__subpackages__"], deps = [ ":base_predictor", "//tensorflow/contrib/saved_model:saved_model_py", diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md index a1f2b590266..9085d9fa719 100644 --- a/tensorflow/contrib/quantize/README.md +++ b/tensorflow/contrib/quantize/README.md @@ -28,7 +28,7 @@ Since it's difficult to add these fake quantization operations to all the required locations in the model, there's a function available that rewrites the training graph. To create a fake quantized training graph: -``` +```python # Build forward pass of model. loss = tf.losses.get_total_loss() @@ -51,7 +51,7 @@ The rewritten *eval graph* is non-trivially different from the *training graph* since the quantization ops affect the batch normalization step. Because of this, we've added a separate rewrite for the *eval graph*: -``` +```python # Build eval model logits = tf.nn.softmax_cross_entropy_with_logits_v2(...) 
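Editor's note: the contrib/quantize README hunks above only retag the code fences as Python, but the surrounding prose is the relevant how-to: one rewrite for the training graph and a separate one for the eval graph, because the quantization ops affect the batch normalization step. A condensed sketch of that usage against the TF 1.x `tf.contrib.quantize` API; `build_model` below is a stand-in defined only for this sketch, not part of the README:

```python
# Condensed sketch of the two contrib.quantize rewrites the README describes.
# (TF 1.x API; build_model is a tiny stand-in just to give the rewriters a graph.)
import tensorflow as tf

def build_model(is_training):
  x = tf.placeholder(tf.float32, [None, 4])
  logits = tf.layers.dense(x, 2)                 # MatMul + BiasAdd is quantizable
  if is_training:
    labels = tf.placeholder(tf.int64, [None])
    return tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  return logits

# Training graph: build the forward pass, rewrite in place, then add the
# training op as usual.
with tf.Graph().as_default() as g:
  loss = build_model(is_training=True)
  tf.contrib.quantize.create_training_graph(input_graph=g, quant_delay=2000000)
  train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

# Eval graph: a separate rewrite, since the quantization ops interact with the
# batch normalization step differently at inference time.
with tf.Graph().as_default() as g:
  logits = build_model(is_training=False)
  tf.contrib.quantize.create_eval_graph(input_graph=g)
```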
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py index 6f659347fba..8619708cdae 100644 --- a/tensorflow/contrib/quantize/python/quant_ops.py +++ b/tensorflow/contrib/quantize/python/quant_ops.py @@ -138,7 +138,7 @@ def LastValueQuantize(inputs, if per_channel: if input_dim >= 2: batch_min = math_ops.reduce_min( - inputs, reduction_indices=reduce_dims, name='BatchMin') + inputs, axis=reduce_dims, name='BatchMin') else: batch_min = inputs else: @@ -147,7 +147,7 @@ def LastValueQuantize(inputs, if per_channel: if input_dim >= 2: batch_max = math_ops.reduce_max( - inputs, reduction_indices=reduce_dims, name='BatchMax') + inputs, axis=reduce_dims, name='BatchMax') else: batch_max = inputs else: @@ -263,7 +263,7 @@ def MovingAvgQuantize(inputs, if per_channel: if input_dim >= 2: batch_min = math_ops.reduce_min( - inputs, reduction_indices=reduce_dims, name='BatchMin') + inputs, axis=reduce_dims, name='BatchMin') else: batch_min = inputs else: @@ -272,7 +272,7 @@ def MovingAvgQuantize(inputs, if per_channel: if input_dim >= 2: batch_max = math_ops.reduce_max( - inputs, reduction_indices=reduce_dims, name='BatchMax') + inputs, axis=reduce_dims, name='BatchMax') else: batch_max = inputs else: diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py index 338923f7512..21d1b121309 100644 --- a/tensorflow/contrib/quantize/python/quantize.py +++ b/tensorflow/contrib/quantize/python/quantize.py @@ -160,7 +160,7 @@ def Quantize(graph, # shouldn't quantize it, since the activation will be Fused into the # Add at inference time. consumers = input_to_ops_map.ConsumerOperations(layer_match.bypass_op) - if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]): + if any(consumer.type in _ACTIVATION_TYPES for consumer in consumers): logging.info('Skipping %s, because its followed by an activation.', layer_match.bypass_op.name) else: @@ -195,7 +195,7 @@ def Quantize(graph, # Add at inference time. consumers = input_to_ops_map.ConsumerOperations( layer_match.post_activation_bypass_op) - if any([consumer.type in _RELU_TYPES for consumer in consumers]): + if any(consumer.type in _RELU_TYPES for consumer in consumers): logging.info('Skipping %s, because its followed by an activation.', layer_match.post_activation_bypass_op.name) else: diff --git a/tensorflow/contrib/resampler/BUILD b/tensorflow/contrib/resampler/BUILD index 38fcca03116..bbf10996759 100644 --- a/tensorflow/contrib/resampler/BUILD +++ b/tensorflow/contrib/resampler/BUILD @@ -13,6 +13,7 @@ load( ) load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") +load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test") tf_custom_op_py_library( name = "resampler_py", @@ -50,10 +51,14 @@ tf_kernel_library( prefix = "resampler_ops", deps = [ ":resampler_ops_op_lib", - "//tensorflow/compiler/tf2xla/kernels:resampler_ops", "//tensorflow/core:framework", "//tensorflow/core:lib", - ], + ] + select({ + "//tensorflow:with_xla_support": [ + "//tensorflow/compiler/tf2xla/kernels:resampler_ops", + ], + "//conditions:default": [], + }), alwayslink = 1, ) @@ -94,3 +99,26 @@ cuda_py_test( "//tensorflow/python:array_ops", ], ) + +tf_xla_py_test( + name = "resampler_ops_xla_test", + size = "small", + srcs = ["xla/resampler_ops_xla_test.py"], + disabled_backends = [ + # TODO(b/74459949) Support BatchDot in CPU backend. 
+ "cpu", + "cpu_ondemand", + ], + # TODO(b/112295522): the OSS build will not likely work in the short to medium term, currently it is blocked by the fact that bazel does not allow py_library to depend on cc_library: https://github.com/bazelbuild/bazel/issues/701 which may not be resolvable. + tags = ["no_oss"], + deps = [ + "//tensorflow/compiler/tests:xla_test", + "//tensorflow/compiler/tf2xla/kernels:resampler_ops", + "//tensorflow/contrib/resampler:resampler_ops", + "//tensorflow/contrib/resampler:resampler_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:platform_test", + "//third_party/py/numpy", + ], +) diff --git a/tensorflow/compiler/tests/resampler_ops_test.py b/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py similarity index 76% rename from tensorflow/compiler/tests/resampler_ops_test.py rename to tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py index f87ac3360c9..d8ca0eab276 100644 --- a/tensorflow/compiler/tests/resampler_ops_test.py +++ b/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py @@ -63,8 +63,8 @@ class ResamplerOpsTest(xla_test.XLATestCase): def testSimple(self): for dtype in self.float_types: input_shape = [1, 2, 2, 1] - input_rgb_data = [0, 5, 13, 54] - input_np = np.array(input_rgb_data, dtype=dtype).reshape(input_shape) + input_data = [0, 5, 13, 54] + input_np = np.array(input_data, dtype=dtype).reshape(input_shape) warp_shape = [1, 2] warp_data = [0.7, 0.6] @@ -151,6 +151,55 @@ class ResamplerOpsTest(xla_test.XLATestCase): expected_grad_data, expected_grad_warp) + def testOutOfBoundWarps(self): + # (x, y) are both less than 0. + for dtype in self.float_types: + input_shape = [1, 2, 2, 1] + input_data = [10, 5, 13, 54] + input_np = np.array(input_data, dtype=dtype).reshape(input_shape) + + warp_shape = [1, 2, 2] + warp_data = [-1, -1, 0.7, 0.6] + warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape) + expected = [[[0.0], [27.62]]] + self._assertForwardOpMatchesExpected(input_np, warp_np, expected) + + # One of (x, y) is less than 0. + for dtype in self.float_types: + input_shape = [1, 2, 2, 1] + input_data = [10, 5, 13, 54] + input_np = np.array(input_data, dtype=dtype).reshape(input_shape) + + warp_shape = [1, 2, 2] + warp_data = [-1, 0.1, 0.7, 0.6] + warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape) + expected = [[[0.0], [27.62]]] + self._assertForwardOpMatchesExpected(input_np, warp_np, expected) + + # Both of (x, y) are greater than image size. + for dtype in self.float_types: + input_shape = [1, 2, 2, 1] + input_data = [10, 5, 13, 54] + input_np = np.array(input_data, dtype=dtype).reshape(input_shape) + + warp_shape = [1, 2, 2] + warp_data = [-0.1, 0.1, 1.2, 2.1] + warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape) + expected = [[[0.0], [0.0]]] + self._assertForwardOpMatchesExpected(input_np, warp_np, expected) + + # One of (x, y) is greater than image size. 
+ for dtype in self.float_types: + input_shape = [1, 2, 2, 1] + input_data = [10, 5, 13, 54] + input_np = np.array(input_data, dtype=dtype).reshape(input_shape) + + warp_shape = [1, 2, 2] + warp_data = [0.1, -0.1, 1.2, 0.1] + warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape) + expected = [[[0.0], [0.0]]] + self._assertForwardOpMatchesExpected(input_np, warp_np, expected) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py index 245fa68eaef..7d57b0413a3 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py @@ -906,7 +906,7 @@ class DropoutWrapperTest(test.TestCase): def testDropoutWrapperKeepNoOutput(self): keep_all = variable_scope.get_variable("all", initializer=1.0) - keep_none = variable_scope.get_variable("none", initializer=1e-10) + keep_none = variable_scope.get_variable("none", initializer=1e-6) res = self._testDropoutWrapper( input_keep_prob=keep_all, output_keep_prob=keep_none, @@ -922,7 +922,7 @@ class DropoutWrapperTest(test.TestCase): def testDropoutWrapperKeepNoStateExceptLSTMCellMemory(self): keep_all = variable_scope.get_variable("all", initializer=1.0) - keep_none = variable_scope.get_variable("none", initializer=1e-10) + keep_none = variable_scope.get_variable("none", initializer=1e-6) # Even though we dropout state, by default DropoutWrapper never # drops out the memory ("c") term of an LSTMStateTuple. res = self._testDropoutWrapper( @@ -943,7 +943,7 @@ class DropoutWrapperTest(test.TestCase): def testDropoutWrapperKeepNoInput(self): keep_all = variable_scope.get_variable("all", initializer=1.0) - keep_none = variable_scope.get_variable("none", initializer=1e-10) + keep_none = variable_scope.get_variable("none", initializer=1e-6) true_full_output = np.array( [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py index 5cba54dd3df..ef372b947ce 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py @@ -227,7 +227,7 @@ class RNNTest(test.TestCase): def testDropout(self): cell = Plus1RNNCell() full_dropout_cell = rnn_cell.DropoutWrapper( - cell, input_keep_prob=1e-12, seed=0) + cell, input_keep_prob=1e-6, seed=0) (name, dep), = full_dropout_cell._checkpoint_dependencies self.assertIs(dep, cell) self.assertEqual("cell", name) diff --git a/tensorflow/contrib/rnn/python/ops/gru_ops.py b/tensorflow/contrib/rnn/python/ops/gru_ops.py index b30ca7882fc..251a933eaec 100644 --- a/tensorflow/contrib/rnn/python/ops/gru_ops.py +++ b/tensorflow/contrib/rnn/python/ops/gru_ops.py @@ -21,7 +21,7 @@ from tensorflow.contrib.rnn.ops import gen_gru_ops from tensorflow.contrib.util import loader from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape -from tensorflow.python.layers import base as base_layer +from tensorflow.python.keras.engine import input_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops @@ -165,7 +165,7 @@ class GRUBlockCell(LayerRNNCell): num_units = cell_size self._cell_size = num_units # Inputs must be 2-dimensional. 
- self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) @property def state_size(self): diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py index 4db431f85a4..b043026bc55 100644 --- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py +++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py @@ -25,6 +25,7 @@ from tensorflow.contrib.rnn.ops import gen_lstm_ops from tensorflow.contrib.util import loader from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.keras.engine import input_spec from tensorflow.python.layers import base as base_layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops @@ -385,7 +386,7 @@ class LSTMBlockCell(LayerRNNCell): "scope": "lstm_cell" } # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) @property def state_size(self): @@ -628,7 +629,7 @@ class LSTMBlockFusedCell(LSTMBlockWrapper): self._use_peephole = use_peephole # Inputs must be 3-dimensional. - self.input_spec = base_layer.InputSpec(ndim=3) + self.input_spec = input_spec.InputSpec(ndim=3) @property def num_units(self): diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py index e159dc95796..8a1c09f171e 100644 --- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py +++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py @@ -30,7 +30,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.keras import activations from tensorflow.python.keras import initializers -from tensorflow.python.layers import base as base_layer +from tensorflow.python.keras.engine import input_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gen_array_ops @@ -2752,7 +2752,7 @@ class SRUCell(rnn_cell_impl.LayerRNNCell): self._activation = activation or math_ops.tanh # Restrict inputs to be 2-dimensional matrices - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) @property def state_size(self): @@ -3089,7 +3089,7 @@ class IndRNNCell(rnn_cell_impl.LayerRNNCell): super(IndRNNCell, self).__init__(_reuse=reuse, name=name, dtype=dtype) # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) self._num_units = num_units self._activation = activation or math_ops.tanh @@ -3183,7 +3183,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell): super(IndyGRUCell, self).__init__(_reuse=reuse, name=name, dtype=dtype) # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) self._num_units = num_units self._activation = activation or math_ops.tanh @@ -3323,7 +3323,7 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell): super(IndyLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype) # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) self._num_units = num_units self._forget_bias = forget_bias @@ -3444,7 +3444,7 @@ class MinimalRNNCell(rnn_cell_impl.LayerRNNCell): super(MinimalRNNCell, self).__init__(name=name, dtype=dtype, **kwargs) # Inputs must be 2-dimensional. 
- self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) self.units = units self.activation = activations.get(activation) @@ -3558,7 +3558,7 @@ class CFNCell(rnn_cell_impl.LayerRNNCell): super(CFNCell, self).__init__(name=name, dtype=dtype, **kwargs) # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) self.units = units self.activation = activations.get(activation) diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD index f0947fe423f..269443b2c65 100644 --- a/tensorflow/contrib/saved_model/BUILD +++ b/tensorflow/contrib/saved_model/BUILD @@ -102,7 +102,10 @@ py_test( size = "medium", srcs = ["python/saved_model/keras_saved_model_test.py"], srcs_version = "PY2AND3", - tags = ["no_windows"], + tags = [ + "no_oss", # TODO(b/119349471): Re-enable + "no_windows", + ], deps = [ ":keras_saved_model", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py index 27b5b6d22e0..ffba514bb96 100644 --- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py +++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py @@ -25,7 +25,6 @@ from tensorflow.python.client import session from tensorflow.python.estimator import keras as estimator_keras_util from tensorflow.python.estimator import model_fn as model_fn_lib from tensorflow.python.estimator.export import export as export_helpers -from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.keras import backend as K from tensorflow.python.keras import models as models_lib @@ -126,7 +125,7 @@ def save_keras_model( export_dir = export_helpers.get_timestamped_export_dir(saved_model_path) temp_export_dir = export_helpers.get_temp_export_dir(export_dir) - builder = saved_model_builder.SavedModelBuilder(temp_export_dir) + builder = saved_model_builder._SavedModelBuilder(temp_export_dir) # Manually save variables to export them in an object-based checkpoint. This # skips the `builder.add_meta_graph_and_variables()` step, which saves a @@ -228,9 +227,10 @@ def _export_mode( g.add_to_collection(ops.GraphKeys.GLOBAL_STEP, clone.optimizer.iterations) # Extract update and train ops from train/test/predict functions. 
+ train_op = None if mode == model_fn_lib.ModeKeys.TRAIN: clone._make_train_function() - builder._add_train_op(clone.train_function.updates_op) + train_op = clone.train_function.updates_op elif mode == model_fn_lib.ModeKeys.EVAL: clone._make_test_function() else: @@ -265,7 +265,8 @@ def _export_mode( model_fn_lib.EXPORT_TAG_MAP[mode], signature_def_map=_create_signature_def_map(clone, mode), saver=saver_lib.Saver(clone_var_list), - main_op=variables.local_variables_initializer()) + init_op=variables.local_variables_initializer(), + train_op=train_op) return None @@ -307,31 +308,11 @@ def _create_signature_def_map(model, mode): serving_only=(mode == model_fn_lib.ModeKeys.PREDICT)) -def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph): +def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph): # pylint: disable=unused-argument """Assert model and clone contain the same checkpointable objects.""" - def get_non_optimizer_objects(m, g): - """Gather set of model and optimizer checkpointable objects.""" - # Set default graph because optimizer.variables() returns optimizer - # variables defined in the default graph. - with g.as_default(): - all_objects = set(checkpointable_utils.list_objects(m)) - optimizer_and_variables = set() - for obj in all_objects: - if isinstance(obj, optimizers.TFOptimizer): - optimizer_and_variables.update(checkpointable_utils.list_objects(obj)) - optimizer_and_variables.update(set(obj.optimizer.variables())) - return all_objects - optimizer_and_variables - - model_objects = get_non_optimizer_objects(model, model_graph) - clone_objects = get_non_optimizer_objects(clone, clone_graph) - - if len(model_objects) != len(clone_objects): - raise errors.InternalError( - None, None, - 'Model and clone must use the same variables.' - '\n\tModel variables: %s\n\t Clone variables: %s' - % (model_objects, clone_objects)) + # TODO(fchollet, kathywu): make sure this works in eager mode. 
+ return True def load_keras_model(saved_model_path): diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py index a65b2ce4661..93d73e1b484 100644 --- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py +++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py @@ -29,14 +29,12 @@ from tensorflow.python import keras from tensorflow.python.client import session from tensorflow.python.eager import context from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.keras.engine import training from tensorflow.python.keras.utils import tf_utils from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -from tensorflow.python.saved_model import constants from tensorflow.python.saved_model import loader_impl from tensorflow.python.saved_model import signature_constants from tensorflow.python.training import training as training_module @@ -255,7 +253,7 @@ def load_model(sess, path, mode): outputs = { k: sess.graph.get_tensor_by_name(v.name) for k, v in meta_graph_def.signature_def[sig_def_key].outputs.items()} - return inputs, outputs + return inputs, outputs, meta_graph_def @test_util.run_all_in_graph_and_eager_modes @@ -332,8 +330,8 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): # Load predict graph, and test predictions with session.Session(graph=ops.Graph()) as sess: - inputs, outputs = load_model(sess, output_path, - model_fn_lib.ModeKeys.PREDICT) + inputs, outputs, _ = load_model(sess, output_path, + model_fn_lib.ModeKeys.PREDICT) predictions = sess.run(outputs[output_name], {inputs[input_name]: input_arr}) @@ -342,19 +340,21 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): if optimizer: # Load eval graph, and test predictions, loss and metric values with session.Session(graph=ops.Graph()) as sess: - inputs, outputs = load_model(sess, output_path, - model_fn_lib.ModeKeys.EVAL) + inputs, outputs, _ = load_model(sess, output_path, + model_fn_lib.ModeKeys.EVAL) # First obtain the loss and predictions, and run the metric update op by # feeding in the inputs and targets. loss, predictions, _ = sess.run( (outputs['loss'], outputs['predictions/' + output_name], - outputs['metrics/mae/update_op']), - {inputs[input_name]: input_arr, inputs[target_name]: target_arr}) + outputs['metrics/mean_absolute_error/update_op']), { + inputs[input_name]: input_arr, + inputs[target_name]: target_arr + }) # The metric value should be run after the update op, to ensure that it # reflects the correct value. 
- metric_value = sess.run(outputs['metrics/mae/value']) + metric_value = sess.run(outputs['metrics/mean_absolute_error/value']) self.assertEqual(int(train_before_export), sess.run(training_module.get_global_step())) @@ -364,17 +364,17 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): # Load train graph, and check for the train op, and prediction values with session.Session(graph=ops.Graph()) as sess: - inputs, outputs = load_model(sess, output_path, - model_fn_lib.ModeKeys.TRAIN) + inputs, outputs, meta_graph_def = load_model( + sess, output_path, model_fn_lib.ModeKeys.TRAIN) self.assertEqual(int(train_before_export), sess.run(training_module.get_global_step())) self.assertIn('loss', outputs) - self.assertIn('metrics/mae/update_op', outputs) - self.assertIn('metrics/mae/value', outputs) + self.assertIn('metrics/mean_absolute_error/update_op', outputs) + self.assertIn('metrics/mean_absolute_error/value', outputs) self.assertIn('predictions/' + output_name, outputs) # Train for a step - train_op = ops.get_collection(constants.TRAIN_OP_KEY) + train_op = loader_impl.get_train_op(meta_graph_def) train_outputs, _ = sess.run( [outputs, train_op], {inputs[input_name]: input_arr, inputs[target_name]: target_arr}) @@ -401,8 +401,8 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): output_path = keras_saved_model.save_keras_model( model, saved_model_path, custom_objects={'relu6': relu6}) with session.Session(graph=ops.Graph()) as sess: - inputs, outputs = load_model(sess, output_path, - model_fn_lib.ModeKeys.PREDICT) + inputs, outputs, _ = load_model(sess, output_path, + model_fn_lib.ModeKeys.PREDICT) input_name = model.input_names[0] output_name = model.output_names[0] predictions = sess.run( @@ -463,11 +463,6 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001)) clone.train_on_batch(input_arr, target_arr) - with self.assertRaisesRegexp( - errors.InternalError, 'Model and clone must use the same variables.'): - keras_saved_model._assert_same_non_optimizer_objects( - model, model_graph, clone, clone_graph) - def testSaveSeqModelWithoutInputShapesRaisesError(self): """A Sequential model that hasn't been built should raise an error.""" model = sequential_model_without_input_shape(True) diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py index 8668c67cf95..922f21b98b3 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py @@ -154,8 +154,8 @@ class AttentionWrapperTest(test.TestCase): if attention_layer_sizes is not None: # Compute sum of attention_layer_sizes. Use encoder_output_depth if None. - attention_depth = sum([attention_layer_size or encoder_output_depth - for attention_layer_size in attention_layer_sizes]) + attention_depth = sum(attention_layer_size or encoder_output_depth + for attention_layer_size in attention_layer_sizes) elif attention_layers is not None: # Compute sum of attention_layers output depth. 
attention_depth = sum( diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py index 4d1807130c5..10e4556dacb 100644 --- a/tensorflow/contrib/summary/summary_ops_test.py +++ b/tensorflow/contrib/summary/summary_ops_test.py @@ -152,6 +152,27 @@ class EagerFileTest(test_util.TensorFlowTestCase): self.assertEqual(len(events), 2) self.assertEqual(events[1].summary.value[0].tag, 'scalar') + def testRecordEveryNGlobalSteps(self): + step = training_util.get_or_create_global_step() + logdir = tempfile.mkdtemp() + + def run_step(): + summary_ops.scalar('scalar', i, step=step) + step.assign_add(1) + + with summary_ops.create_file_writer( + logdir).as_default(), summary_ops.record_summaries_every_n_global_steps( + 2, step): + for i in range(10): + run_step() + # And another 10 steps as a graph function. + run_step_fn = function.defun(run_step) + for i in range(10): + run_step_fn() + + events = summary_test_util.events_from_logdir(logdir) + self.assertEqual(len(events), 11) + def testMaxQueue(self): logs = tempfile.mkdtemp() with summary_ops.create_file_writer( @@ -279,12 +300,9 @@ class EagerDbTest(summary_test_util.SummaryDbTest): def testDbURIOpen(self): tmpdb_path = os.path.join(self.get_temp_dir(), 'tmpDbURITest.sqlite') - tmpdb_uri = six.moves.urllib_parse.urljoin("file:", tmpdb_path) - tmpdb_writer = summary_ops.create_db_writer( - tmpdb_uri, - "experimentA", - "run1", - "user1") + tmpdb_uri = six.moves.urllib_parse.urljoin('file:', tmpdb_path) + tmpdb_writer = summary_ops.create_db_writer(tmpdb_uri, 'experimentA', + 'run1', 'user1') with summary_ops.always_record_summaries(): with tmpdb_writer.as_default(): summary_ops.scalar('t1', 2.0) diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc index 3f24f58f03a..22b6f09d0cd 100644 --- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc +++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc @@ -73,7 +73,16 @@ class SummaryFileWriter : public SummaryWriterInterface { e->set_step(global_step); e->set_wall_time(GetWallTime()); Summary::Value* v = e->mutable_summary()->add_value(); - t.AsProtoTensorContent(v->mutable_tensor()); + + if (t.dtype() == DT_STRING) { + // Treat DT_STRING specially, so that tensor_util.MakeNdarray in Python + // can convert the TensorProto to string-type numpy array. MakeNdarray + // does not work with strings encoded by AsProtoTensorContent() in + // tensor_content. + t.AsProtoField(v->mutable_tensor()); + } else { + t.AsProtoTensorContent(v->mutable_tensor()); + } v->set_tag(tag); if (!serialized_metadata.empty()) { v->mutable_metadata()->ParseFromString(serialized_metadata); diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc index cd3f712256f..ffbfb9533e8 100644 --- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc +++ b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h" #include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/io/path.h" @@ -104,6 +105,23 @@ TEST_F(SummaryFileWriterTest, WriteTensor) { CHECK_EQ(e.summary().value_size(), 1); EXPECT_EQ(e.summary().value(0).tag(), "name"); })); + TF_CHECK_OK(SummaryTestHelper( + "string_tensor_test", + [](SummaryWriterInterface* writer) { + Tensor hello(DT_STRING, TensorShape({})); + hello.scalar()() = "hello"; + TF_RETURN_IF_ERROR(writer->WriteTensor( + 2, hello, "name", SummaryMetadata().SerializeAsString())); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name"); + EXPECT_EQ(e.summary().value(0).tensor().dtype(), DT_STRING); + EXPECT_EQ(e.summary().value(0).tensor().string_val()[0], "hello"); + })); } TEST_F(SummaryFileWriterTest, WriteScalar) { diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD index 20bcd2447e6..784acce444a 100644 --- a/tensorflow/contrib/tensorrt/BUILD +++ b/tensorflow/contrib/tensorrt/BUILD @@ -29,6 +29,10 @@ load( "if_tensorrt", ) +exports_files(glob([ + "test/testdata/*", +])) + tf_cuda_cc_test( name = "tensorrt_test_cc", size = "small", @@ -491,6 +495,7 @@ cuda_py_tests( "test/memory_alignment_test.py", "test/multi_connection_neighbor_engine_test.py", "test/neighboring_engine_test.py", + "test/quantization_test.py", "test/rank_two_test.py", "test/reshape_transpose_test.py", "test/vgg_block_nchw_test.py", @@ -527,6 +532,30 @@ cuda_py_tests( ], ) +cuda_py_test( + name = "quantization_mnist_test", + srcs = ["test/quantization_mnist_test.py"], + additional_deps = [ + ":tf_trt_integration_test_base", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python/keras:keras", + "//tensorflow/python/estimator:estimator", + ], + data = [ + "test/testdata/checkpoint", + "test/testdata/model.ckpt-46900.data-00000-of-00001", + "test/testdata/model.ckpt-46900.index", + ], + tags = [ + "no_cuda_on_cpu_tap", + "no_pip", + "no_tap", # It is not able to download the mnist data. + "no_windows", + "nomac", + ], +) + cc_library( name = "utils", srcs = ["convert/utils.cc"], diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index 26d54eb156c..812948bb303 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -82,60 +82,76 @@ std::vector GetLoadedTensorRTVersion() { } TrtCandidateSelector::TrtCandidateSelector( - const grappler::GraphProperties& graph_properties) - : graph_properties_(graph_properties) {} + const grappler::GraphProperties& graph_properties, int precision_mode) + : graph_properties_(graph_properties), precision_mode_(precision_mode) {} Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) { // TODO(laigd): move this set to TrtNodeValidator where it should belong. 
// LINT.IfChange static const std::set candidate_ops = { - "Identity", - "Snapshot", - "Const", - "Conv2D", - "MaxPool", - "BiasAdd", - "Relu", - "Add", - "Mul", - "Sub", - "Rsqrt", - "Pad", - "Mean", - "AvgPool", - "ConcatV2", - "DepthwiseConv2dNative", - "FusedBatchNorm", - "FusedBatchNormV2", - "Div", - "RealDiv", - "Rsqrt", - "Reciprocal", - "Exp", - "Log", - "Sqrt", - "Abs", - "Neg", - "Transpose", - "Reshape", - "MatMul", - "BatchMatMul", - "Softmax", - "Minimum", - "Maximum", - "TopKV2", - "Sum", - "Prod", - "Max", - "Min", + "Identity", + "Snapshot", + "Const", + "Conv2D", + "MaxPool", + "BiasAdd", + "Relu", + "Sigmoid", + "Tanh", + "Add", + "Mul", + "Sub", + "Rsqrt", + "Pad", + "Mean", + "AvgPool", + "ConcatV2", + "DepthwiseConv2dNative", + "FusedBatchNorm", + "FusedBatchNormV2", + "Div", + "RealDiv", + "Rsqrt", + "Reciprocal", + "Exp", + "Log", + "Sqrt", + "Abs", + "Neg", + "Transpose", + "Reshape", + "MatMul", + "BatchMatMul", + "Softmax", + "Minimum", + "Maximum", + "TopKV2", + "Sum", + "Prod", + "Max", + "Min", + "Relu6", + "Square", }; - // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc) - const bool is_supported_op_type = + bool is_supported_op_type = (candidate_ops.count(node->type_string()) || PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string())); + static const std::set quantize_ops = { + "QuantizeAndDequantizeV2", + "QuantizeAndDequantizeV3", + "FakeQuantWithMinMaxVars", + "FakeQuantWithMinMaxArgs", + }; + // In INT8 mode, we will always apply the quantization ranges provided by + // these ops to the relevant tensors. This happens regardless of the value of + // use_calibration. + if (precision_mode_ == INT8MODE && quantize_ops.count(node->type_string())) { + is_supported_op_type = true; + } + // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc) if (!is_supported_op_type) { return errors::Unimplemented("Op type ", node->type_string(), - " is not supported."); + " is not supported"); } std::vector input_edges; @@ -170,7 +186,7 @@ tensorflow::Status BuildNodeMap( tensorflow::Status ConvertCalibGraphToInferGraph( const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph, bool is_dyn_op) { - VLOG(0) << "Starting Calib Conversion"; + LOG(INFO) << "Starting Calib Conversion"; infer_graph->CopyFrom(graph_def); auto trt_rm = TRTResourceManager::instance(); auto calib_rm = trt_rm->getManager("TRTCalibration"); @@ -220,18 +236,19 @@ tensorflow::Status ConvertGraphDefToTensorRT( const std::vector& output_names, size_t max_batch_size, size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def, int precision_mode, int minimum_segment_size, bool is_dyn_op, - int max_cached_engines, std::vector cached_engine_batches) { + int max_cached_engines, std::vector cached_engine_batches, + bool use_calibration) { // Create GrapplerItem. tensorflow::grappler::GrapplerItem item; item.fetch = output_names; item.graph = graph_def; - // TODO(aaroey): we should have used single machine cluster like the - // following, but the problem is then wrap_conversion will depend on - // direct_session and cause double linking problems. To fix this we need to - // fix or get rid of the swig dependency. Here we use VirtualCluster - // as a work around, and we need to create a session to initialize the - // underlying device before calling this method. 
+// TODO(aaroey): we should have used single machine cluster like the +// following, but the problem is then wrap_conversion will depend on +// direct_session and cause double linking problems. To fix this we need to +// fix or get rid of the swig dependency. Here we use VirtualCluster +// as a work around, and we need to create a session to initialize the +// underlying device before calling this method. #if 0 // Create single machine cluster. Note that this will create a session and // initialize the gpu devices. @@ -264,7 +281,9 @@ tensorflow::Status ConvertGraphDefToTensorRT( #endif // Create RewriterConfig. - tensorflow::RewriterConfig rw_cfg; + tensorflow::ConfigProto config_proto; + auto& rw_cfg = + *config_proto.mutable_graph_options()->mutable_rewrite_options(); // TODO(aaroey): use only const folding and layout for the time being since // new optimizers break the graph for trt. rw_cfg.add_optimizers("constfold"); @@ -285,9 +304,10 @@ tensorflow::Status ConvertGraphDefToTensorRT( list->add_i(batch); } } + parameters["use_calibration"].set_b(use_calibration); // Run optimizer. - tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg); + tensorflow::grappler::MetaOptimizer meta_opt(nullptr, config_proto); TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, new_graph_def)); if (VLOG_IS_ON(5)) { @@ -433,7 +453,8 @@ tensorflow::Status GetEngineInfo( << "but this shouldn't have happened"; info->device = *segment_devices.begin(); } else { - LOG(ERROR) << "Can't find a device placement for the op!"; + VLOG(1) << "No device is assigned to the segment. " + << "A device will be assigned during graph execution (inference)."; } return Status::OK(); } @@ -564,27 +585,30 @@ tensorflow::Status CreateTRTNode(const std::vector& infos, int pos, } } } + + const bool calibrate_int8 = + (info.precision_mode == INT8MODE && info.use_calibration); + // Build the engine and get its serialized representation. string segment_string; - if (info.engine_type == EngineInfo::EngineType::TRTStatic || - info.precision_mode == INT8MODE) { + if (info.engine_type == EngineInfo::EngineType::TRTStatic || calibrate_int8) { // Create static engine for fp32/fp16 mode, and test validity of the engine - // for int8 mode. We don't want engine to fail at the calibration time. - // So we are constructing a FP32 engine here to check its validity, and if - // it is a valid engine then we put the serialized graphdef to the op. - // Otherwise we skip node creation for this engine. + // for int8 calibration mode. We don't want engine to fail at the + // calibration time. So we are constructing a FP32 engine here to check its + // validity, and if it is a valid engine then we put the serialized graphdef + // to the op. Otherwise we skip node creation for this engine. Logger trt_logger; TrtUniquePtrType engine; // TODO(sami): What happens if 1st dim is not batch? TF_RETURN_IF_ERROR(ConvertGraphDefToEngine( - info.segment_graph_def, - info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode, + info.segment_graph_def, calibrate_int8 ? FP32MODE : info.precision_mode, max_batch_size, info.max_workspace_size_bytes, input_shapes, &trt_logger, alloc, /*calibrator=*/nullptr, &engine, + info.use_calibration, /*convert_successfully=*/nullptr)); TrtUniquePtrType engine_data(engine->serialize()); segment_string = string((const char*)engine_data->data(), engine_data->size()); - if (info.precision_mode == INT8MODE) { + if (calibrate_int8) { // See above comment about why not putting this inside the 'else' branch. 
segment_string = info.segment_graph_def.SerializeAsString(); } @@ -596,7 +620,7 @@ tensorflow::Status CreateTRTNode(const std::vector& infos, int pos, // conversion. string prec_string; TF_RETURN_IF_ERROR(GetPrecisionModeName(info.precision_mode, &prec_string)); - if (info.precision_mode == INT8MODE && + if (info.precision_mode == INT8MODE && calibrate_int8 && !TRTResourceManager::instance()->getManager("TRTCalibration")) { LOG(ERROR) << "Failed to construct calibration storage"; } @@ -632,6 +656,7 @@ tensorflow::Status CreateTRTNode(const std::vector& infos, int pos, .Attr("cached_engine_batches", {max_batch_size}) .Attr("workspace_size_bytes", info.max_workspace_size_bytes) .Attr("precision_mode", prec_string) + .Attr("use_calibration", info.use_calibration) .Attr("OutT", out_types) .Finalize(&trt_node); if (!status.ok()) { @@ -864,19 +889,17 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { } segment_options.minimum_segment_size = params.minimum_segment_size; tensorflow::tensorrt::segment::SegmentNodesVector initial_segments; - TrtCandidateSelector candidate_selector(*params.graph_properties); + TrtCandidateSelector candidate_selector(*params.graph_properties, + params.precision_mode); TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph( - &graph, - std::bind(&TrtCandidateSelector::IsTensorRTCandidate, &candidate_selector, - std::placeholders::_1), + &graph, std::bind(&TrtCandidateSelector::IsTensorRTCandidate, + &candidate_selector, std::placeholders::_1), // Input validation is already done by TrtCandidateSelector, so we don't // need to check the input edges. [](const Edge* edge) { return true; }, OutputEdgeValidator(), segment_options, &initial_segments)); - if (initial_segments.size() > 1) { - VLOG(0) << "MULTIPLE tensorrt candidate conversion: " + LOG(INFO) << "Number of TensorRT candidate segments: " << initial_segments.size(); - } // Get the EngineInfo for each segment. std::unordered_map node_map; @@ -902,13 +925,17 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { continue; } curr_engine.precision_mode = params.precision_mode; - curr_engine.engine_type = - (params.is_dyn_op || params.precision_mode == INT8MODE - ? EngineInfo::EngineType::TRTDynamic - : EngineInfo::EngineType::TRTStatic); + if (params.use_calibration && params.precision_mode != INT8MODE) { + return errors::InvalidArgument( + "Calibration with FP32 or FP16 is not supported."); + } + curr_engine.engine_type = ((params.is_dyn_op || params.use_calibration) + ? EngineInfo::EngineType::TRTDynamic + : EngineInfo::EngineType::TRTStatic); + curr_engine.use_calibration = params.use_calibration; curr_engine.cached_engine_batches = params.cached_engine_batches; curr_engine.maximum_cached_engines = params.max_cached_engines; - StrAppend(&curr_engine.engine_name, "my_trt_op_", t); + StrAppend(&curr_engine.engine_name, "TRTEngineOp_", t); status = RegisterSegmentFunctionToFunctionLibrary( &graph, curr_engine.segment_graph_def, curr_engine.engine_name); if (!status.ok()) { @@ -969,16 +996,9 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { &graph, alloc.get(), &engine_nodes); // If status is ok, we successfully added the node to the graph and can // remove segment ops. Otherwise graph is not modified. 
- string msg = StrCat("Engine ", engine.engine_name, " creation for segment ", - i, ", composed of ", + string msg = StrCat("TensorRT node ", engine.engine_name, + " added for segment ", i, " consisting of ", converted_segments.at(i).first.size(), " nodes"); - if (VLOG_IS_ON(1)) { - StrAppend(&msg, " ("); - for (const string& node_name : converted_segments.at(i).first) { - StrAppend(&msg, node_name, ", "); - } - StrAppend(&msg, ")"); - } if (status.ok()) { LOG(INFO) << msg << " succeeded."; for (auto node_name : converted_segments.at(i).first) { @@ -986,7 +1006,14 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { } } else { // Graph is not modified. - LOG(WARNING) << msg << " failed: " << status << ". Skipping..."; + LOG(WARNING) << msg << " failed: " << status << ". Fallback to TF..."; + } + if (VLOG_IS_ON(1)) { + msg = "Segment consists of nodes: "; + for (const string& node_name : converted_segments.at(i).first) { + StrAppend(&msg, node_name, ", "); + } + VLOG(1) << msg; } } cudaSetDevice(old_cuda_device); diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h index 1c9d82105a7..1f39f56f639 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.h +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h @@ -35,7 +35,8 @@ namespace convert { // supported by TRT. class TrtCandidateSelector { public: - TrtCandidateSelector(const grappler::GraphProperties& graph_properties); + TrtCandidateSelector(const grappler::GraphProperties& graph_properties, + int precision_mode); // Returns OK iff 'node' is a TF-TRT conversion candidate, which will be added // to TRT subgraph and later converted into TRT engine. @@ -49,6 +50,9 @@ class TrtCandidateSelector { // GraphProperties of the graph whose nodes are to be validated by // IsTensorRTCandidate(). const grappler::GraphProperties& graph_properties_; + + // Quantization ops are only converted when using quantized precisions. + const int precision_mode_; }; struct ConversionParams { @@ -63,6 +67,7 @@ struct ConversionParams { cluster(nullptr), is_dyn_op(false), fixed_input_size(true), + use_calibration(true), max_cached_engines(1) {} const tensorflow::GraphDef* input_graph_def; const std::vector* output_names; @@ -76,6 +81,7 @@ struct ConversionParams { bool is_dyn_op; // Whether to create engine on conversion or execution time bool fixed_input_size; // Assume non-batch ranks of input tensors are fixed int max_cached_engines; // maximum number of cached engines + bool use_calibration; std::vector cached_engine_batches; // list of cached engines }; @@ -95,7 +101,7 @@ tensorflow::Status ConvertGraphDefToTensorRT( size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def, int precision_mode = 1, int minimum_segment_size = 3, bool is_dyn_op = false, int max_cached_engines = 1, - std::vector cached_engine_batches = {}); + std::vector cached_engine_batches = {}, bool use_calibration = true); // Method to call from optimization pass tensorflow::Status ConvertAfterShapes(ConversionParams& params); diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc index f10729987fd..2d2bfeb192c 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc @@ -85,27 +85,42 @@ TEST(TrtCandidateSelector, Basics) { ops::MatMul(s.WithOpName("matmul_with_incompatible_input"), incompatible_feed, const_2); + // Quantize ops. 
+ auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f); + auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("quantize"), feed, + quantize_attrs); + + // Get GrapplerItem and GraphProperties. grappler::GrapplerItem item; TF_EXPECT_OK(s.ToGraphDef(&item.graph)); Tensor feed_tensor(DT_FLOAT, input_shape); item.feed.push_back(std::make_pair("feed", feed_tensor)); - grappler::GraphProperties graph_properties(item); TF_EXPECT_OK(graph_properties.InferStatically(true)); - TrtCandidateSelector selector(graph_properties); - TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node())); - ExpectStatus( - selector.IsTensorRTCandidate(incompatible_matmul.operation.node()), - error::INVALID_ARGUMENT, - "transpose_a is not supported for TensorRT FullyConnected " - "(op: MatMul), at: incompatible_matmul"); - ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()), - error::UNIMPLEMENTED, "Op type Sin is not supported"); - ExpectStatus(selector.IsTensorRTCandidate( - matmul_with_incompatible_input.operation.node()), - error::INTERNAL, - "Failed to convert input with index 0 to a TRT_TensorOrWeights"); + for (const int precision_mode : {FP32MODE, INT8MODE}) { + TrtCandidateSelector selector(graph_properties, precision_mode); + TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node())); + ExpectStatus( + selector.IsTensorRTCandidate(incompatible_matmul.operation.node()), + error::INVALID_ARGUMENT, + "transpose_a is not supported for TensorRT FullyConnected " + "(op: MatMul), at: incompatible_matmul"); + ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()), + error::UNIMPLEMENTED, "Op type Sin is not supported"); + ExpectStatus( + selector.IsTensorRTCandidate( + matmul_with_incompatible_input.operation.node()), + error::INTERNAL, + "Failed to convert input with index 0 to a TRT_TensorOrWeights"); + if (precision_mode == INT8MODE) { + TF_EXPECT_OK(selector.IsTensorRTCandidate(quantize.operation.node())); + } else { + ExpectStatus(selector.IsTensorRTCandidate(quantize.operation.node()), + error::UNIMPLEMENTED, + "Op type FakeQuantWithMinMaxArgs is not supported"); + } + } } class FakeCluster : public grappler::Cluster { diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index e2988f5f2a8..25a34dd3503 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -54,10 +54,10 @@ limitations under the License. // would work! 
#define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2) -#define TFTRT_INTERNAL_ERROR_AT_NODE(node) \ - do { \ - return tensorflow::errors::Internal( \ - "TFTRT::", __FUNCTION__, "failed to add TRT layer, at: ", node); \ +#define TFTRT_INTERNAL_ERROR_AT_NODE(node) \ + do { \ + return tensorflow::errors::Internal( \ + "TFTRT::", __FUNCTION__, " failed to add TRT layer, at: ", node); \ } while (0) #define TFTRT_RETURN_ERROR_IF_FALSE(status, node) \ @@ -130,7 +130,7 @@ void GetOutputProperties(const grappler::GraphProperties& graph_properties, *dtype = out_shape.dtype(); *shape = out_shape.shape(); } else { - VLOG(0) << "Unknown output shape" << node->name(); + LOG(INFO) << "Unknown output shape" << node->name(); *dtype = node->output_type(out_port); } } @@ -181,16 +181,55 @@ Status ValidateTensorProperties(const string& producer_node_type, if (shape.dim_size(d) < 0) { return errors::InvalidArgument( "Input tensor with shape ", shape.DebugString(), - " has an unknown non-batch dimemension at dim ", d); + " has an unknown non-batch dimension at dim ", d); } } return Status::OK(); } +string DebugString(const nvinfer1::DimensionType type) { + switch (type) { + case nvinfer1::DimensionType::kSPATIAL: + return "kSPATIAL"; + case nvinfer1::DimensionType::kCHANNEL: + return "kCHANNEL"; + case nvinfer1::DimensionType::kINDEX: + return "kINDEX"; + case nvinfer1::DimensionType::kSEQUENCE: + return "kSEQUENCE"; + default: + return StrCat(static_cast(type), "=unknown"); + } +} + +string DebugString(const nvinfer1::DataType trt_dtype) { + switch (trt_dtype) { + case nvinfer1::DataType::kFLOAT: + return "kFLOAT"; + case nvinfer1::DataType::kHALF: + return "kHALF"; + case nvinfer1::DataType::kINT8: + return "kINT8"; + case nvinfer1::DataType::kINT32: + return "kINT32"; + default: + return "Invalid TRT data type"; + } +} + string DebugString(const nvinfer1::Dims& dims) { string out = StrCat("nvinfer1::Dims(nbDims=", dims.nbDims, ", d="); for (int i = 0; i < dims.nbDims; ++i) { - StrAppend(&out, dims.d[i], ","); + StrAppend(&out, dims.d[i], "[", DebugString(dims.type[i]), "],"); + } + StrAppend(&out, ")"); + return out; +} + +string DebugString(const nvinfer1::Permutation& permutation, int len) { + string out = "nvinfer1::Permutation("; + for (int i = 0; i < len; ++i) { + StrAppend(&out, permutation.order[i], ","); } StrAppend(&out, ")"); return out; @@ -198,16 +237,15 @@ string DebugString(const nvinfer1::Dims& dims) { string DebugString(const nvinfer1::ITensor& tensor) { return StrCat("nvinfer1::ITensor(@", reinterpret_cast(&tensor), - ", shape=", DebugString(tensor.getDimensions()), ")"); + ", name=", tensor.getName(), + ", dtype=", DebugString(tensor.getType()), + ", dims=", DebugString(tensor.getDimensions()), ")"); } -// Return whether or not the broadcast is feasible; -bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l, - const bool operand_l_is_tensor, - const nvinfer1::Dims& operand_r, - const bool operand_r_is_tensor, - nvinfer1::Dims* operand_l_new_shape, - nvinfer1::Dims* operand_r_new_shape) { +Status Converter::GetTrtBroadcastShape( + const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r, + nvinfer1::Dims* operand_l_new_dims, + nvinfer1::Dims* operand_r_new_dims) const { // *************************************************************************** // TensorRT Elementwise op supports broadcast but requires both tensor to be // of Identical rank @@ -232,52 +270,59 @@ bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l, // -> T: 1 1 1 -1 3 
5 1 // -> W: 1 1 1 1 3 5 1 // *************************************************************************** + if (!operand_l.is_tensor() && !operand_r.is_tensor()) { + return errors::InvalidArgument( + "Broadcasting requires at least one of the operands be tensors"); + } + const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1; - const size_t element_size = sizeof(operand_l.d[0]); + auto compute_output_dims = + [max_nb_dims](const TRT_TensorOrWeights& input, int broadcast_num_dims, + int* output_dims_array, nvinfer1::Dims* output_dims) { + const nvinfer1::Dims input_dims = input.GetTrtDims(); + std::fill(output_dims_array, output_dims_array + max_nb_dims, 1); + std::copy(input_dims.d, input_dims.d + input_dims.nbDims, + output_dims_array + broadcast_num_dims - input_dims.nbDims); + if (input.is_tensor()) { + const int true_input_dims = input_dims.nbDims + 1; + if (true_input_dims < broadcast_num_dims) { + return errors::InvalidArgument( + "Broadcasting beyond batch dimension is not supported ", + "(tensor #dims ", true_input_dims, " vs broadcast #dims ", + broadcast_num_dims, ")"); + } + // Set the batch dimension to -1, since batch size is not supposed to + // be broadcasted. + output_dims_array[0] = -1; + } + // Copy to output dimensions (stripping the batch dimension). + output_dims->nbDims = broadcast_num_dims - 1; + std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims, + output_dims->d); + return Status::OK(); + }; - // fill in dimensions - int l_s[max_nb_dims]; - std::fill(l_s, l_s + max_nb_dims, 1); - int l_d = operand_l_is_tensor ? operand_l.nbDims + 1 : operand_l.nbDims; - int r_s[max_nb_dims]; - std::fill(r_s, r_s + max_nb_dims, 1); - int r_d = operand_r_is_tensor ? operand_r.nbDims + 1 : operand_r.nbDims; + // Compute the output dimensions. + const int broadcast_num_dims = + std::max(operand_l.GetTrtDims().nbDims + (operand_l.is_tensor() ? 1 : 0), + operand_r.GetTrtDims().nbDims + (operand_r.is_tensor() ? 
1 : 0)); + int output_l[max_nb_dims], output_r[max_nb_dims]; + TF_RETURN_IF_ERROR(compute_output_dims(operand_l, broadcast_num_dims, + output_l, operand_l_new_dims)); + TF_RETURN_IF_ERROR(compute_output_dims(operand_r, broadcast_num_dims, + output_r, operand_r_new_dims)); - int max_d = std::max(l_d, r_d); - std::memcpy(l_s + max_d - operand_l.nbDims, operand_l.d, - operand_l.nbDims * element_size); - std::memcpy(r_s + max_d - operand_r.nbDims, operand_r.d, - operand_r.nbDims * element_size); - - // set -1 for batch dimension, since batch size is not supposed to be - // broadcasted - if (operand_l_is_tensor) { - if (max_d != l_d) { // if broadcast beyond batch dimension, fail - return false; - } - l_s[0] = -1; - } - if (operand_r_is_tensor) { - if (max_d != r_d) { // if broadcast beyond batch dimension, fail - return false; - } - r_s[0] = -1; - } - - // compare broadcast feasibility - for (int i = max_d - 1; i >= 0; i--) { - if ((l_s[i] != r_s[i]) && (l_s[i] != 1) && (r_s[i] != 1)) { - return false; + // Compare broadcast feasibility + for (int i = 0; i < broadcast_num_dims; ++i) { + if ((output_l[i] != output_r[i]) && (output_l[i] != 1) && + (output_r[i] != 1)) { + return errors::InvalidArgument( + "Infeasible broadcast scheme (", "batch_dim: ", output_l[0], ", ", + DebugString(*operand_l_new_dims), " vs ", "batch_dim: ", output_r[0], + ", ", DebugString(*operand_r_new_dims), ")"); } } - - // output new TensorRT Dimension (stripping the batch dimension) - operand_l_new_shape->nbDims = max_d - 1; - std::memcpy(operand_l_new_shape->d, l_s + 1, (max_d - 1) * element_size); - operand_r_new_shape->nbDims = max_d - 1; - std::memcpy(operand_r_new_shape->d, r_s + 1, (max_d - 1) * element_size); - - return true; + return Status::OK(); } inline bool DimsEqual(const nvinfer1::Dims& dim_l, @@ -381,8 +426,8 @@ size_t TRT_ShapedWeights::size_bytes() const { string TRT_ShapedWeights::DebugString() const { return StrCat("TRT_ShapedWeights(shape=", convert::DebugString(shape_), - ", type=", type_, - ", values=", reinterpret_cast(GetValues()), ")"); + ", type=", DataTypeString(type_), ", values=", + reinterpret_cast(GetValues()), ")"); } // A fake ITensor implementation used to check whether the TF-TRT converter can @@ -425,7 +470,9 @@ class TRT_TensorOrWeights::SimpleITensor : public nvinfer1::ITensor { void setLocation(nvinfer1::TensorLocation location) override {} #if NV_TENSORRT_MAJOR >= 5 - bool setDynamicRange(float min, float max) override {} + bool setDynamicRange(float min, float max) override { return true; } + + float getDynamicRange() const override { return 0; } #endif private: @@ -489,8 +536,7 @@ nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { string TRT_TensorOrWeights::DebugString() const { string output = "TRT_TensorOrWeights(type="; if (is_tensor()) { - StrAppend(&output, "tensor @", reinterpret_cast(tensor()), - ", shape=", convert::DebugString(tensor()->getDimensions()), + StrAppend(&output, "tensor=", convert::DebugString(*tensor()), ", batch_size=", batch_size_); } else { StrAppend(&output, "weights=", weights_.DebugString()); @@ -627,11 +673,10 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights, break; } case tensorflow::DataType::DT_HALF: { - Reorder2( - {k, c}, static_cast(iweights.GetValues()), - istrides, - static_cast(const_cast(oweights->GetValues())), - ostrides); + Reorder2({k, c}, static_cast(iweights.GetValues()), + istrides, static_cast( + const_cast(oweights->GetValues())), + ostrides); break; } default: @@ -753,8 +798,9 @@ Status 
TrtNodeValidator::ValidateNode( Status status = ConvertToTensorOrWeights( *pair.first, pair.second, graph_properties, &tensor_or_weights); if (!status.ok()) { - return errors::Internal("Failed to convert input with index ", i, - " to a TRT_TensorOrWeights"); + return errors::Internal( + "Failed to convert input with index ", i, + " to a TRT_TensorOrWeights: ", status.error_message()); } inputs.push_back(tensor_or_weights); } @@ -786,8 +832,11 @@ Status TrtNodeValidator::ConvertConstToWeights( return status; } -Converter::Converter(nvinfer1::INetworkDefinition* trt_network, bool is_fp16) - : trt_network_(trt_network), is_fp16_(is_fp16) { +Converter::Converter(nvinfer1::INetworkDefinition* trt_network, + int precision_mode, bool use_calibration) + : trt_network_(trt_network), + precision_mode_(precision_mode), + use_calibration_(use_calibration) { this->RegisterOpConverters(); } @@ -812,13 +861,18 @@ Status Converter::ConvertNode(const NodeDef& node_def) { TRT_TensorOrWeights& output = outputs[i]; string output_name = node_def.name(); if (i != 0) output_name = StrCat(output_name, ":", i); - // We need to check the name before setting it. For Identity op where the - // output is the input, if its input is one of the engine input, setting - // the name here will overwrite engine input bindings which will cause - // runtime error. + // We need to check the name before setting it. If the input is one of the + // engine input, setting the name here will overwrite engine input + // bindings which will cause runtime error. if (output.is_tensor()) { const char* tensor_name = output.tensor()->getName(); - if (tensor_name == nullptr || std::strlen(tensor_name) == 0) { + if (!tensorflow::str_util::StartsWith(tensor_name, kInputPHName)) { + // TRT initializes tensor names as "(Unnamed ITensor* N)". We rename + // them to match their corresponding TensorFlow name. + // Note: ITensors that we create internally within TF-TRT which are + // not inputs or outputs of a node will not be renamed. This is a + // potential cause of confusion if an error message or warning + // mentions the unnamed tensor. 
output.tensor()->setName(output_name.c_str()); } } @@ -930,11 +984,14 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor, nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Transpose"); + MarkQuantizationRangesAsInferrable(input_tensor, layer->getOutput(0)); nvinfer1::Permutation permutation; for (int32_t i = 0; i < dims.nbDims; ++i) { permutation.order[i] = order_with_batch_dim[i + 1] - 1; } + VLOG(1) << "TransposeTensor permutation: " + << DebugString(permutation, dims.nbDims); layer->setFirstTranspose(permutation); nvinfer1::Dims reshape_dims; @@ -950,6 +1007,38 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor, return tensorflow::Status::OK(); } +Status Converter::GetWeightRange(const TRT_ShapedWeights& weights, + float* out_min, float* out_max) const { + switch (weights.type_) { + case DataType::DT_FLOAT: { + auto inp = static_cast(weights.GetValues()); + auto result = std::minmax_element(inp, inp + weights.count()); + *out_min = *result.first; + *out_max = *result.second; + break; + } + case DataType::DT_HALF: { + auto inp = static_cast(weights.GetValues()); + auto result = std::minmax_element(inp, inp + weights.count()); + *out_min = Eigen::half_impl::half_to_float(*result.first); + *out_max = Eigen::half_impl::half_to_float(*result.second); + break; + } + case DataType::DT_INT32: { + auto inp = static_cast(weights.GetValues()); + auto result = std::minmax_element(inp, inp + weights.count()); + *out_min = static_cast(*result.first); + *out_max = static_cast(*result.second); + break; + } + default: + return errors::Unimplemented( + "Data type not supported for GetWeightRange: ", + DataTypeString(weights.type_)); + } + return Status::OK(); +} + Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, const nvinfer1::Dims& dims, const nvinfer1::ITensor** tensor) { @@ -964,8 +1053,9 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, } if (can_check_shapes && TrtDimsNumElements(input.GetTrtDims()) != TrtDimsNumElements(dims)) { - return tensorflow::errors::InvalidArgument( - "Reshape shapes are not compatible."); + return errors::InvalidArgument("Reshape shapes are not compatible (", + DebugString(input.GetTrtDims()), " vs ", + DebugString(dims), ")"); } if (input.is_tensor()) { @@ -976,6 +1066,8 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, *const_cast(input.tensor())); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape"); layer->setReshapeDimensions(dims); + MarkQuantizationRangesAsInferrable( + const_cast(input.tensor()), layer->getOutput(0)); *tensor = layer->getOutput(0); } } else { @@ -983,10 +1075,123 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, this->network()->addConstant(dims, input.weights().GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape"); *tensor = layer->getOutput(0); + if (precision_mode() == INT8MODE && !use_calibration()) { + // If we are in int8 mode and not calibrating, we need to explicitly set a + // quantization range for the output tensor of the IConstantLayer. Here we + // set the range to [min(weights), max(weights)]. + float min_range = 0.0f; + float max_range = 0.0f; + TF_RETURN_IF_ERROR( + GetWeightRange(input.weights(), &min_range, &max_range)); + // Avoid setting range to 0 because TRT will throw an error. 
If the + // weights are zero then the range doesn't matter: using 127.0f should + // ensure the quantized weight will be exactly zero. + if (min_range == 0.0f && max_range == 0.0f) { + min_range = -127.0f; + max_range = 127.0f; + } + ProvideQuantizationRange(const_cast(*tensor), + min_range, max_range); + } } return tensorflow::Status::OK(); } +void Converter::MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input, + nvinfer1::ITensor* output) { + quantization_infer_.push_back({input, output}); + quantization_infer_.push_back({output, input}); +} + +void Converter::ProvideQuantizationRange(nvinfer1::ITensor* tensor, + float min_range, float max_range) { + float symmetric_range = std::max(std::abs(min_range), std::abs(max_range)); + quantization_ranges_[tensor] = symmetric_range; +} + +void Converter::MaybeApplyQuantizationRanges() { + if (precision_mode() != INT8MODE) return; + + // Infer ranges across marked ops. + PropagateQuantizationRanges(); + // Apply ranges. +#if NV_TENSORRT_MAJOR >= 5 + for (auto pair : quantization_ranges_) { + nvinfer1::ITensor* tensor = pair.first; + const float range = pair.second; + VLOG(1) << "Setting range for: " << tensor->getName() << ": " << range; + // TODO(laigd): if 'tensor' already has a range set which doesn't match + // 'range', it should report error. + tensor->setDynamicRange(-range, range); + } +#endif + + // Warn user about tensors that are missing ranges. If TRT fuses some layers + // then these tensors may not actually be required, which is why this is + // just a warning. If we are still missing ranges even after fusion, + // Builder::buildCudaEngine() will return nullptr and we will catch the + // error at that point. + if (!use_calibration()) { + // Get all tensors from network + std::set all_tensors; + for (int i = 0; i < this->network()->getNbLayers(); i++) { + nvinfer1::ILayer* layer = this->network()->getLayer(i); + for (int j = 0; j < layer->getNbInputs(); j++) { + all_tensors.insert(layer->getInput(j)); + } + for (int j = 0; j < layer->getNbOutputs(); j++) { + all_tensors.insert(layer->getOutput(j)); + } + } + // Find tensors with no ranges + for (auto tensor : all_tensors) { + if (!quantization_ranges_.count(tensor)) { + // Note: there may be some warnings for "(Unnamed ITensor* N)". These + // are tensors which are created internally by TF-TRT. The ranges for + // these unnamed ITensors are always inferred from user provided ranges, + // thus there will also be a warning for the range(s) the user missed. + LOG(WARNING) << "Quantization range was not found for " + << tensor->getName() << ". " + << "This is okay if TensorRT does not need the range " + << "(e.g. due to node fusion)."; + } + } + } +} + +void Converter::PropagateQuantizationRanges() { + // Propagate ranges across edges in quantization_infer_ until no new + // information is added. + // Note: this function modifies quantization_infer_, it might be better to + // modify a copy instead if we for some reason need quantization_infer_ + // later. 
+ bool information_added = true; + while (information_added) { + information_added = false; + for (auto it = quantization_infer_.begin(); + it != quantization_infer_.end();) { + auto input_tensor_range = quantization_ranges_.find(it->first); + auto output_tensor_range = quantization_ranges_.find(it->second); + if (input_tensor_range != quantization_ranges_.end() && + output_tensor_range == quantization_ranges_.end()) { + // Input has range but output doesn't: copy range + // TODO(laigd): consider reporting error if it a different range is + // already set. + quantization_ranges_[it->second] = input_tensor_range->second; + information_added = true; + VLOG(1) << "Copy quantization range: " << it->first->getName() << " -> " + << it->second->getName(); + } + // We can remove edges when the output range is known + if (quantization_ranges_.find(it->second) != quantization_ranges_.end()) { + it = quantization_infer_.erase(it); + } else { + ++it; + } + } + } +} + Status Converter::GetInputs(const tensorflow::NodeDef& node_def, std::vector* inputs) const { for (auto const& input_name : node_def.input()) { @@ -1043,12 +1248,11 @@ TRT_ShapedWeights ConvertFP32ToFP16(TrtWeightStore* store, } // **************************************************************************** -// Constant folding functions -// TODO(jie): once optimizer kicks in, we should have done constant folding -// there. +// Constant folding functions for weights. +// TODO(laigd): we should probably use eigen directly. // ***************************************************************************** struct LambdaFactory { - enum class OP_CATEGORY : int { RSQRT = 0, NEG, ADD, MUL, SUB, RECIP }; + enum class OP_CATEGORY : int { RSQRT = 0, NEG, RECIP }; OP_CATEGORY op; template @@ -1063,84 +1267,10 @@ struct LambdaFactory { case OP_CATEGORY::RECIP: return [](T t) -> T { return 1.0 / t; }; default: - VLOG(2) << "Not supported op for unary: " << static_cast(op); + LOG(ERROR) << "Not supported op for unary: " << static_cast(op); return nullptr; } } - - template - std::function binary() { - switch (op) { - case OP_CATEGORY::ADD: - return [](T l, T r) -> T { return l + r; }; - case OP_CATEGORY::SUB: - return [](T l, T r) -> T { return l - r; }; - case OP_CATEGORY::MUL: - return [](T l, T r) -> T { return l * r; }; - default: - LOG(WARNING) << "Not supported op for binary: " << static_cast(op); - } - return [](T l, T r) -> T { - LOG(FATAL) << "Unsupported op type "; - return l; - }; - } - - template - std::function broadcast_r(T val) { - VLOG(2) << "LAMBDA VAL : " << val; - switch (op) { - case OP_CATEGORY::ADD: - return [val](T l) -> T { - VLOG(2) << "LAMBDA VAL : " << val; - return l + val; - }; - case OP_CATEGORY::SUB: - return [val](T l) -> T { - VLOG(2) << "LAMBDA VAL : " << val; - return l - val; - }; - case OP_CATEGORY::MUL: - return [val](T l) -> T { - VLOG(2) << "LAMBDA VAL : " << val; - return l * val; - }; - default: - LOG(WARNING) << "Not supported op for binary: " << static_cast(op); - } - return [val](T l) -> T { - LOG(FATAL) << "Unsupported op type "; - return l; - }; - } - - template - std::function broadcast_l(T val) { - VLOG(2) << "LAMBDA VAL : " << val; - switch (op) { - case OP_CATEGORY::ADD: - return [val](T l) -> T { - VLOG(2) << "LAMBDA VAL : " << val; - return val + l; - }; - case OP_CATEGORY::SUB: - return [val](T l) -> T { - VLOG(2) << "LAMBDA VAL : " << val; - return val - l; - }; - case OP_CATEGORY::MUL: - return [val](T l) -> T { - VLOG(2) << "LAMBDA VAL : " << val; - return val * l; - }; - default: - LOG(ERROR) << 
"Not supported op for binary: " << static_cast(op); - } - return [val](T l) -> T { - LOG(FATAL) << "Unsupported op type "; - return l; - }; - } }; template <> @@ -1148,15 +1278,18 @@ std::function LambdaFactory::unary() { switch (op) { case OP_CATEGORY::RSQRT: { VLOG(2) << "RSQRT GETS DONE"; - return [](Eigen::half t) -> Eigen::half { + return [](Eigen::half t) { return Eigen::half(1.0 / sqrt(static_cast(t))); }; } case OP_CATEGORY::NEG: - return [](Eigen::half t) -> Eigen::half { return -t; }; - // TODO(aaroey): can we support RECIP? + return [](Eigen::half t) { return -t; }; + case OP_CATEGORY::RECIP: + return [](Eigen::half t) { + return Eigen::half(1.0 / static_cast(t)); + }; default: - VLOG(2) << "Not supported op for unary: " << static_cast(op); + LOG(ERROR) << "Not supported op for unary: " << static_cast(op); return nullptr; } } @@ -1188,50 +1321,48 @@ tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights, return tensorflow::Status::OK(); } +// If swapped_inputs is false, 'tensor' is the left operand and 'weights' is the +// right operand. If swapped_inputs is true, those two are swapped. +// // TODO(jie): broadcast is needed yet not implemented. -// Only implemented channel wise for the time being -tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, - const nvinfer1::ITensor* tensor, - TRT_ShapedWeights weights, - bool swapped_inputs) { +// Only implemented channel wise for the time being. +Status BinaryTensorOpWeight(OpConverterParams* params, + const nvinfer1::ITensor* tensor, + TRT_ShapedWeights weights, bool swapped_inputs) { + static const std::unordered_set supported_ops = {"Sub", "Add", "Mul", + "Div", "RealDiv"}; const auto& node_def = params->node_def; - // tensor is the left operand while weights is the right operand; - // when swapped_inputs set to true, those two are swapped. - // TODO(aaroey): use a set. - if (node_def.op() != "Sub" && node_def.op() != "Add" && - node_def.op() != "Mul" && node_def.op() != "Div" && - node_def.op() != "RealDiv") { - return tensorflow::errors::Unimplemented( - "op not supported: " + node_def.op() + ", at: " + node_def.name()); + if (!supported_ops.count(node_def.op())) { + return errors::Unimplemented(node_def.op(), " is not supported, at ", + node_def.name()); } - // Check type consistency - nvinfer1::DataType ttype; - TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &ttype)); + // Check type consistency. + nvinfer1::DataType trt_dtype; + TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &trt_dtype)); - // Check scale mode + // Check scale mode. 
auto dims_w = weights.shape_; - auto dims_t = tensor->getDimensions(); + const auto dims_t = tensor->getDimensions(); // TODO(jie): addScale checks for input tensor dimension if (dims_t.nbDims != 3) { - return tensorflow::errors::InvalidArgument( - "addScale requires tensor with rank 3, " + node_def.name()); + return errors::InvalidArgument("addScale requires tensor with rank 3, at ", + node_def.name()); } - // default to element-wise + // Default to element-wise auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; // TODO(jie): maybe use a permutation instead to support more cases; - bool permutation_flag = false; + bool need_to_permute = false; if (weights.count() == 1) { - VLOG(2) << "UNIFORM"; scale_mode = nvinfer1::ScaleMode::kUNIFORM; } else { - // no broadcasting on Batch dimension; - VLOG(2) << "WEIGHTS DIM: " << dims_w.nbDims - << " tensor DIM: " << dims_t.nbDims; + VLOG(2) << "weights dims: " << DebugString(dims_w) + << "; tensor dims: " << DebugString(dims_t); + // Make sure no broadcasting on batch dimension. if (dims_w.nbDims == dims_t.nbDims + 1) { if (dims_w.d[0] == 1) { for (int i = 1; i < dims_w.nbDims; i++) { @@ -1239,72 +1370,70 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, } dims_w.nbDims--; } else { - return tensorflow::errors::InvalidArgument( - "Binary op cannot operate on batch, " + node_def.name()); + return errors::InvalidArgument("Binary op cannot operate on batch, at ", + node_def.name()); } } if (dims_w.nbDims == dims_t.nbDims && dims_w.d[0] == dims_t.d[0]) { scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - // default is element; + // Default is element-wise for (int i = 1; i < dims_w.nbDims; i++) { if (dims_w.d[i] != dims_t.d[i]) { - // if dimension does not match, switch back to channel; - VLOG(2) << "channel"; + // If dimension does not match, switch back to per-channel scale_mode = nvinfer1::ScaleMode::kCHANNEL; break; } } - // if channel as candidate, validate it + // If the mode is per-channel, since channel dimension is assumed to be + // the third to last dimension, we need to make sure all other dimensions + // have size 1. if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) { for (int i = 1; i < dims_w.nbDims; i++) { if (dims_w.d[i] != 1) - return tensorflow::errors::InvalidArgument( - "Weight shape not compatible at, " + node_def.name()); + return errors::InvalidArgument( + "Weight dims not compatible for channel-wise broadcast at ", + node_def.name()); } - } else { - VLOG(2) << "elementwise"; } } else if (dims_w.nbDims == 1 && dims_w.d[0] == dims_t.d[dims_t.nbDims - 1]) { - // channel wise and broadcast required; - permutation_flag = true; + // Channel wise and broadcast required. We compare the last dimension of + // the tensor shape because of tensorflow default broadcasting rules. + need_to_permute = true; scale_mode = nvinfer1::ScaleMode::kCHANNEL; } else { - return tensorflow::errors::InvalidArgument( - "Weight shape not compatible at, " + node_def.name()); + return errors::InvalidArgument("Weight dims not compatible at ", + node_def.name()); } } + // TODO(laigd): we should add validation_only support in TransposeTensor() and + // PrepareTensorForShape(). + if (params->validation_only) return Status::OK(); - // transpose last dimension + // Transpose last dimension. std::vector permutation(dims_t.nbDims + 1); - if (permutation_flag) { - if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) { - // we swap the last dimension into channel for trt. - // because of tensorflow default broadcasting rules. 
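
A hedged sketch of the permutation built just below: with the implicit batch at index 0, the last dimension is swapped into slot 1 so that IScaleLayer's kCHANNEL mode treats it as the channel dimension. The helper name is illustrative only:

```
#include <vector>

// Identity permutation except that the last dimension and slot 1 (the channel
// slot, with the implicit batch at slot 0) are swapped.
std::vector<int> ChannelSwapPermutation(int rank_without_batch) {
  std::vector<int> perm(rank_without_batch + 1);
  for (int i = 0; i < static_cast<int>(perm.size()); ++i) perm[i] = i;
  perm[1] = rank_without_batch;   // last dim -> channel slot
  perm[rank_without_batch] = 1;   // old channel slot -> last dim
  return perm;
}
// Example: a rank-3 input yields {0, 3, 2, 1}; applying the same permutation
// again restores the original layout, which is how the output is transposed
// back after the scale layer.
```
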
- for (int i = 0; i < static_cast(permutation.size()); i++) { - permutation[i] = i; - } - permutation[1] = dims_t.nbDims; - permutation[dims_t.nbDims] = 1; - TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - const_cast(tensor), permutation, &tensor)); - } else { - return tensorflow::errors::InvalidArgument( - "Transpose cannot be applied, " + node_def.name()); + if (need_to_permute) { + // We swap the last dimension into channel for trt, because of tensorflow + // default broadcasting rules. + for (int i = 0; i < static_cast(permutation.size()); i++) { + permutation[i] = i; } + permutation[1] = dims_t.nbDims; + permutation[dims_t.nbDims] = 1; + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + const_cast(tensor), permutation, &tensor)); } - if (params->converter->is_fp16()) { + if (params->converter->precision_mode() == FP16MODE) { weights = ConvertFP32ToFP16(params->weight_store, weights); } - // prepare weights + // Prepare weights TRT_ShapedWeights shift_weights(weights.type_); TRT_ShapedWeights scale_weights(weights.type_); TRT_ShapedWeights power_weights(weights.type_); - // Maybe I should do a switch if (node_def.op() == "Sub") { if (swapped_inputs) { shift_weights = weights; @@ -1312,6 +1441,10 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, *const_cast(tensor), nvinfer1::UnaryOperation::kNEG); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + // Since quantization ranges are symmetric, the same range as the input + // will work for the negation of the input. + params->converter->MarkQuantizationRangesAsInferrable( + const_cast(tensor), layer->getOutput(0)); tensor = layer->getOutput(0); } else { TRT_ShapedWeights neg_weights = @@ -1323,6 +1456,25 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, } } else if (node_def.op() == "Div" || node_def.op() == "RealDiv") { if (swapped_inputs) { + // We need to infer the quantization range for this intermediate tensor. + // + // x -> [Recip] -> 1/x -> [Scale] -> s/x + // ^ + // need range for this + // + // We have the quantization scales for x and s/x - can we divide the scale + // for s/x by s? Only if it is a scalar. + // + // Because of this issue, fall back to BinaryTensorOpTensor if we are + // doing INT8 with no calibration. There is most likely no performance + // penalty by falling back here. + if (params->converter->precision_mode() == INT8MODE && + !params->converter->use_calibration()) { + return errors::Unimplemented( + "Intermediate quantization range cannot be determined without" + " calibration. Falling back to BinaryTensorOpTensor for ", + node_def.op(), ", at ", node_def.name()); + } scale_weights = weights; nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary( *const_cast(tensor), @@ -1342,8 +1494,8 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, } else if (node_def.op() == "Add") { shift_weights = weights; } else { - return tensorflow::errors::Unimplemented("Binary op not supported: " + - node_def.op()); + // This should not happen. 
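
The Sub/Add/Mul/Div cases above all reduce to a single IScaleLayer, whose per-element semantics are (x * scale + shift) ^ power, with a kNEG or kRECIP unary layer prepended for the swapped cases. A sketch of that mapping using plain floats and a hypothetical helper (RealDiv is handled like Div):

```
#include <string>

struct ScaleArgs {
  float preprocessed_x;  // x after the optional kNEG / kRECIP unary layer
  float scale;
  float shift;
};

// For each supported op, the scale layer then computes
// preprocessed_x * scale + shift (power is left at its default of 1).
ScaleArgs MapBinaryOpToScale(const std::string& op, bool swapped,
                             float x, float w) {
  if (op == "Add") return {x, 1.0f, w};                     // x + w
  if (op == "Mul") return {x, w, 0.0f};                     // x * w
  if (op == "Sub" && !swapped) return {x, 1.0f, -w};        // x - w
  if (op == "Sub" && swapped) return {-x, 1.0f, w};         // w - x (kNEG)
  if (op == "Div" && !swapped) return {x, 1.0f / w, 0.0f};  // x / w
  return {1.0f / x, w, 0.0f};                               // w / x (kRECIP)
}
```
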
+ return errors::Unimplemented("Binary op not supported at ", node_def.op()); } nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( @@ -1353,8 +1505,8 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); const nvinfer1::ITensor* output_tensor = layer->getOutput(0); - // transpose back dimension - if (permutation_flag) { + // Transpose back dimension + if (need_to_permute) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( const_cast(output_tensor), permutation, &output_tensor)); @@ -1398,7 +1550,7 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) { return tensorflow::errors::Internal( "Conv2D expects kernel of dimension 4, at: " + node_def.name()); } - if (params->converter->is_fp16()) { + if (params->converter->precision_mode() == FP16MODE) { weights_rsck = ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights()); } @@ -1445,6 +1597,8 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) { nvinfer1::DimsHW(padding[0].first, padding[1].first), nvinfer1::DimsHW(padding[0].second, padding[1].second)); TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name()); + params->converter->MarkQuantizationRangesAsInferrable( + const_cast(tensor), pad_layer->getOutput(0)); padding = {{0, 0}, {0, 0}}; tensor = pad_layer->getOutput(0); VLOG(2) << "TENSOR after: " << DebugString(tensor->getDimensions()); @@ -1486,9 +1640,9 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, params->node_def.name()); } -tensorflow::Status BinaryTensorOpTensor(OpConverterParams* params, - const TRT_TensorOrWeights& operand_l, - const TRT_TensorOrWeights& operand_r) { +Status BinaryTensorOpTensor(OpConverterParams* params, + const TRT_TensorOrWeights& operand_l, + const TRT_TensorOrWeights& operand_r) { const auto& node_def = params->node_def; static const std::unordered_map ops{ {"Add", nvinfer1::ElementWiseOperation::kSUM}, @@ -1499,50 +1653,52 @@ tensorflow::Status BinaryTensorOpTensor(OpConverterParams* params, {"Minimum", nvinfer1::ElementWiseOperation::kMIN}, {"Maximum", nvinfer1::ElementWiseOperation::kMAX}, }; - - const nvinfer1::ITensor* tensor_l; - const nvinfer1::ITensor* tensor_r; - - nvinfer1::Dims dim_l; - nvinfer1::Dims dim_r; - - if (!TensorRTGetBroadcastShape(operand_l.GetTrtDims(), operand_l.is_tensor(), - operand_r.GetTrtDims(), operand_r.is_tensor(), - &dim_l, &dim_r)) { - return tensorflow::errors::InvalidArgument( - "Binary op broadcast scheme not supported by TensorRT op: " + - node_def.op() + ", at: " + node_def.name()); - } - - TF_RETURN_IF_ERROR( - params->converter->PrepareTensorForShape(operand_l, dim_l, &tensor_l)); - TF_RETURN_IF_ERROR( - params->converter->PrepareTensorForShape(operand_r, dim_r, &tensor_r)); - - // get trt type & shape - TFAttrs attrs(node_def); - // maybe this part has to be moved into the block of rsqrt later - nvinfer1::DataType dtype = attrs.get("T"); - - // check type consistency - TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype); - TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype); auto op_pair = ops.find(node_def.op()); if (op_pair == ops.end()) { - return tensorflow::errors::Unimplemented( - "binary op: ", node_def.op(), " not supported at: ", node_def.name()); + return errors::Unimplemented("Binary op ", node_def.op(), + " not supported at: ", node_def.name()); } + nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; + Status status = params->converter->GetTrtBroadcastShape( + operand_l, 
operand_r, &broadcasted_dims_l, &broadcasted_dims_r); + if (!status.ok()) { + return errors::InvalidArgument( + "Unsupported binary op broadcast scheme for op ", node_def.name(), ": ", + status.error_message()); + } + if (params->validation_only) return Status::OK(); + + const nvinfer1::ITensor* tensor_l = nullptr; + const nvinfer1::ITensor* tensor_r = nullptr; + status = params->converter->PrepareTensorForShape( + operand_l, broadcasted_dims_l, &tensor_l); + if (status.ok()) { + status = params->converter->PrepareTensorForShape( + operand_r, broadcasted_dims_r, &tensor_r); + } + if (!status.ok()) { + return errors::Internal("Failed to convert binary op ", node_def.name(), + ": ", status.error_message()); + } + + // Check type consistency. + TFAttrs attrs(node_def); + nvinfer1::DataType dtype = attrs.get("T"); + TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype) + << DebugString(tensor_l->getType()) << " vs " << DebugString(dtype); + TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype) + << DebugString(tensor_r->getType()) << " vs " << DebugString(dtype); + + // Add ElementWise layer. nvinfer1::IElementWiseLayer* layer = params->converter->network()->addElementWise( - // TODO(aaroey): will tensor_l/tensor_r get modified? *const_cast(tensor_l), *const_cast(tensor_r), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - nvinfer1::ITensor* output_tensor = layer->getOutput(0); - // pass the output + // Pass the output params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return tensorflow::Status::OK(); } @@ -1789,6 +1945,8 @@ tensorflow::Status ConvertPool(OpConverterParams* params) { nvinfer1::DimsHW(padding[0].first, padding[1].first), nvinfer1::DimsHW(padding[0].second, padding[1].second)); TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name()); + params->converter->MarkQuantizationRangesAsInferrable( + const_cast(tensor), pad_layer->getOutput(0)); padding = {{0, 0}, {0, 0}}; tensor = pad_layer->getOutput(0); } @@ -1796,6 +1954,11 @@ tensorflow::Status ConvertPool(OpConverterParams* params) { nvinfer1::IPoolingLayer* layer = params->converter->network()->addPooling( *const_cast(tensor), type, ksize); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + // TODO(tmorris): Average pooling may not be entirely safe to infer + // quantization range through (at least forwards - backwards should be fine). + // Max pooling is okay. 
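
GetTrtBroadcastShape, called above before adding the IElementWiseLayer, computes right-aligned broadcast shapes for the two operands. A sketch of the core idea, assuming plain int vectors and ignoring the implicit batch dimension that the real method additionally has to handle:

```
#include <algorithm>
#include <cstddef>
#include <vector>

// Right-align both shapes, pad the shorter one with 1s, and verify that each
// pair of dimensions is broadcast-compatible (equal, or one of them is 1).
bool RightAlignForBroadcast(std::vector<int> l, std::vector<int> r,
                            std::vector<int>* l_out, std::vector<int>* r_out) {
  const size_t rank = std::max(l.size(), r.size());
  l.insert(l.begin(), rank - l.size(), 1);
  r.insert(r.begin(), rank - r.size(), 1);
  for (size_t i = 0; i < rank; ++i) {
    if (l[i] != r[i] && l[i] != 1 && r[i] != 1) return false;
  }
  *l_out = l;
  *r_out = r;
  return true;
}
// Example: {5, 3} and {3} become {5, 3} and {1, 3}, which the element-wise
// layer can broadcast; {5, 3} and {2} are rejected.
```
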
+ params->converter->MarkQuantizationRangesAsInferrable( + const_cast(tensor), layer->getOutput(0)); layer->setStride(stride); layer->setPadding({padding[0].first, padding[1].first}); @@ -1813,110 +1976,290 @@ tensorflow::Status ConvertPool(OpConverterParams* params) { } tensorflow::Status ConvertActivation(OpConverterParams* params) { - const nvinfer1::ITensor* tensor = params->inputs.at(0).tensor(); + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + if (inputs.size() != 1) { + return tensorflow::errors::InvalidArgument( + node_def.op(), " expects one input, at ", node_def.name()); + } + if (!inputs.at(0).is_tensor()) { + return tensorflow::errors::Unimplemented( + node_def.op(), " is only implemented for tensors, at ", + node_def.name()); + } + static const std::unordered_map ops{ + {"Relu", nvinfer1::ActivationType::kRELU}, + {"Sigmoid", nvinfer1::ActivationType::kSIGMOID}, + {"Tanh", nvinfer1::ActivationType::kTANH}, + }; + auto op_pair = ops.find(node_def.op()); + if (op_pair == ops.end()) { + return tensorflow::errors::Unimplemented("Activation op: ", node_def.op(), + " not supported at: ", + node_def.name()); + } + if (params->validation_only) return tensorflow::Status::OK(); + + // Start conversion. + const nvinfer1::ITensor* tensor = inputs.at(0).tensor(); nvinfer1::IActivationLayer* layer = params->converter->network()->addActivation( - *const_cast(tensor), - nvinfer1::ActivationType::kRELU); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name()); + *const_cast(tensor), op_pair->second); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); + // Set quantization range for output of Sigmoid, Tanh. + if (node_def.op() == "Sigmoid") { + params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 1.0f); + } else if (node_def.op() == "Tanh") { + params->converter->ProvideQuantizationRange(output_tensor, -1.0f, 1.0f); + } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return tensorflow::Status::OK(); } -tensorflow::Status ConvertScale(OpConverterParams* params) { +Status ConvertQuantize(OpConverterParams* params) { + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + if ((inputs.size() == 0) || + (node_def.op() == "FakeQuantWithMinMaxArgs" && inputs.size() != 1) || + (node_def.op() == "FakeQuantWithMinMaxVars" && inputs.size() != 3) || + (node_def.op() == "QuantizeAndDequantizeV2" && inputs.size() != 3) || + (node_def.op() == "QuantizeAndDequantizeV3" && inputs.size() != 4)) { + return errors::InvalidArgument("Invalid number of inputs for ", + node_def.op(), ", at ", node_def.name()); + } + if (inputs.at(0).is_weights()) { + // TensorRT will automatically quantize weights, so we will ignore ranges + // for weights. + params->outputs->push_back(inputs.at(0)); + return Status::OK(); + } + float min_range = 0.0f; + float max_range = 0.0f; + if (node_def.op() == "FakeQuantWithMinMaxArgs") { + // Get ranges via node attributes. + TFAttrs attrs(node_def); + if (attrs.count("min") == 0 || attrs.count("max") == 0) { + return errors::InvalidArgument("Min or max attribute not found for ", + node_def.op(), " at ", node_def.name()); + } + min_range = attrs.get("min"); + max_range = attrs.get("max"); + } else if (node_def.op() == "FakeQuantWithMinMaxVars" || + node_def.op() == "QuantizeAndDequantizeV2" || + node_def.op() == "QuantizeAndDequantizeV3") { + // Get ranges via inputs. 
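
Several converters in this file provide fixed output ranges rather than relying on calibration: Sigmoid and Softmax produce values in (0, 1), Tanh in (-1, 1), and Relu6 in [0, 6]. A small sketch of that lookup together with the symmetric bound that is actually stored; names here are illustrative, not part of the converter API:

```
#include <algorithm>
#include <cmath>
#include <string>
#include <utility>

// {min, max} for ops whose output range is known a priori; {0, 0} means the
// range has to come from calibration or an explicit quantization node.
std::pair<float, float> KnownOutputRange(const std::string& op) {
  if (op == "Sigmoid" || op == "Softmax") return {0.0f, 1.0f};
  if (op == "Tanh") return {-1.0f, 1.0f};
  if (op == "Relu6") return {0.0f, 6.0f};
  return {0.0f, 0.0f};
}

// ProvideQuantizationRange keeps only the symmetric bound.
float SymmetricBound(const std::pair<float, float>& range) {
  return std::max(std::abs(range.first), std::abs(range.second));
}
```
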
+ if (!inputs.at(1).is_weights() || !inputs.at(2).is_weights()) { + return errors::InvalidArgument("Min and max inputs for ", node_def.op(), + " must be weights not tensors, at ", + node_def.name()); + } + auto get_weights_value = [&inputs](int index) { + auto raw_weights = static_cast( + const_cast(inputs.at(index).weights().GetValues())); + return raw_weights[0]; + }; + min_range = get_weights_value(1); + max_range = get_weights_value(2); + } else { + return errors::InvalidArgument("Unknown quantization op ", node_def.op(), + ", at ", node_def.name()); + } + if (params->validation_only) return Status::OK(); + + // Store ranges for tensor + params->converter->ProvideQuantizationRange( + const_cast(inputs.at(0).tensor()), min_range, + max_range); + // Sometimes, TRT may not quantize a tensor, either because it chooses to + // execute a higher precision kernel or because of op fusion. In these cases, + // accuracy will suffer if the model was trained to expect quantization at + // that tensor. We should consider adding a clip(tensor, min_range, max_range) + // operation here to ensure that any arbitrarily placed quantize node will + // execute as expected. However, this will negatively affect performance. If + // users train their models in a way which models inference as close as + // possible (i.e. not quantizing in place where fusion will occur), then there + // is no problem with the current implementation. + params->outputs->push_back(inputs.at(0)); + return Status::OK(); +} + +// TODO(pdavoodi): we should update relu6 implementation once TensorRT supports +// Relu6 natively. +tensorflow::Status ConvertRelu6(OpConverterParams* params) { + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + if (inputs.size() != 1) { + return tensorflow::errors::InvalidArgument( + "Invalid number of inputs for Relu6, at ", node_def.name()); + } + if (inputs.at(0).is_weights()) { + return tensorflow::errors::Unimplemented( + "Relu6 is only implemented for tensors, not weights, at ", + node_def.name()); + } + if (params->validation_only) return Status::OK(); + // *************************************************************************** + // TensorRT does not implement Relu6 natively. This function converts Relu6 op + // to available TensorRT ops: Relu6(x) = min(Relu(x), 6) + // *************************************************************************** + + // Input Tensor + const nvinfer1::ITensor* tensor = inputs.at(0).tensor(); + + // Relu operation i.e. Relu(x) = max(0, x) + nvinfer1::IActivationLayer* relu_layer = + params->converter->network()->addActivation( + *const_cast(tensor), + nvinfer1::ActivationType::kRELU); + TFTRT_RETURN_ERROR_IF_NULLPTR(relu_layer, node_def.name()); + + // Large range of relu is problematic during quantization in INT8 precision + // mode. Setting dynamic range of relu = [0.f, 6.0f] helps with quantization. + // TRT only uses dynamic ranges in INT8 precision mode, + // and this does not affect the FP32 path. + params->converter->ProvideQuantizationRange(relu_layer->getOutput(0), 0.0f, + 6.0f); + + // Create a constant layer to store the floating point weight i.e. 6.0f This + // tensor will be broadcasted uniformly during elementwise `min` operation. 
+ // The constant has to have the same rank as the input in order for TRT to + // broadcast + nvinfer1::Dims dims; + dims.nbDims = relu_layer->getOutput(0)->getDimensions().nbDims; + for (int i = 0; i < dims.nbDims; i++) { + dims.d[i] = 1; + } + TRT_ShapedWeights weights = params->weight_store->GetTempWeights( + tensorflow::DataType::DT_FLOAT, dims); + auto weights_ptr = + static_cast(const_cast(weights.GetValues())); + weights_ptr[0] = 6.0f; + nvinfer1::IConstantLayer* const6_layer = + params->converter->network()->addConstant(dims, weights.GetTrtWeights()); + TFTRT_RETURN_ERROR_IF_NULLPTR(const6_layer, node_def.name()); + params->converter->ProvideQuantizationRange(const6_layer->getOutput(0), 0.0f, + 6.0f); + + // ElementWise Min Operation + // Min op is a nop for INT8 execution path, as the input tensor + // to this layer will only have values in range [0.f, 6.0f]. + const nvinfer1::ITensor* tensor_l = relu_layer->getOutput(0); + const nvinfer1::ITensor* tensor_r = const6_layer->getOutput(0); + nvinfer1::IElementWiseLayer* relu6_layer = + params->converter->network()->addElementWise( + *const_cast(tensor_l), + *const_cast(tensor_r), + nvinfer1::ElementWiseOperation::kMIN); + TFTRT_RETURN_ERROR_IF_NULLPTR(relu6_layer, node_def.name()); + nvinfer1::ITensor* output_tensor = relu6_layer->getOutput(0); + params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f); + + params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); +} + +tensorflow::Status ConvertBiasAdd(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; if (inputs.size() != 2 || !inputs.at(0).is_tensor() || !inputs.at(1).is_weights()) { - return tensorflow::errors::Unimplemented( - "ConvertScale only supports tensorweight: ", node_def.name()); + return errors::InvalidArgument("Input expects tensor and weights, at ", + node_def.name()); } + if (params->validation_only) return Status::OK(); - const nvinfer1::ITensor* tensor = inputs.at(0).tensor(); - TRT_ShapedWeights weights = inputs.at(1).weights(); - if (params->converter->is_fp16()) { - weights = ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights()); - } - - TRT_ShapedWeights empty_weights(weights.type_); + nvinfer1::ITensor* tensor = + const_cast(inputs.at(0).tensor()); + const nvinfer1::Dims original_dims = tensor->getDimensions(); TFAttrs attrs(node_def); - - const auto data_format = attrs.get("data_format"); - int channel_index; - const auto dims = tensor->getDimensions(); - if (data_format == "NHWC") { - // 1). NHWC is really N+C - channel_index = dims.nbDims - 1; // batch dimension is implicit here! - } else { - // 2). NCHW is really N+CHW - channel_index = 0; // batch dimension is implicit here! - } + const string data_format = attrs.get("data_format"); + const int channel_index = + (data_format == "NHWC" ? original_dims.nbDims - 1 : 0); nvinfer1::Permutation permutation; - for (int32_t i = 0; i < dims.nbDims; ++i) { - permutation.order[i] = i; - } - - if (channel_index >= 0) { + if (channel_index != 0) { + // Permute the dimensions so that the channel dimension is the first + // dimension. 
+ for (int i = 0; i < original_dims.nbDims; ++i) { + permutation.order[i] = i; + } permutation.order[0] = channel_index; permutation.order[channel_index] = 0; - } else { - return tensorflow::errors::Unimplemented( - "TFTRT::BiasAdd cannot apply on batch dimension, at ", node_def.name()); + VLOG(1) << "ConvertBiasAdd permutation: " + << DebugString(permutation, original_dims.nbDims); } // TensorRT addScale requires input to be of rank 3, we need to apply - // transpose as well as reshape - if (channel_index != 0 || dims.nbDims != 3) { + // transpose as well as reshape. + // TODO(laigd): this doesn't match what the TRT doc says, fix the doc? + if (channel_index != 0 || original_dims.nbDims != 3) { nvinfer1::IShuffleLayer* shuffle_layer = - params->converter->network()->addShuffle( - *const_cast(tensor)); + params->converter->network()->addShuffle(*tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); + params->converter->MarkQuantizationRangesAsInferrable( + tensor, shuffle_layer->getOutput(0)); + + // NOTE(laigd): for some reason we need to apply the reshape + // unconditionally. The default shape has nbDims==-1 and it seems the + // behavior is undefined in some cases. nvinfer1::Dims reshape_dims; reshape_dims.nbDims = 3; - reshape_dims.d[0] = 0; // 0 copy from the input - reshape_dims.d[1] = dims.nbDims >= 2 ? 0 : 1; // 0 copy from the input - reshape_dims.d[2] = dims.nbDims >= 3 ? -1 : 1; // -1 infer from the rest + // 0 means copying from input; -1 means inferring from the rest. + reshape_dims.d[0] = 0; + reshape_dims.d[1] = original_dims.nbDims >= 2 ? 0 : 1; + reshape_dims.d[2] = original_dims.nbDims >= 3 ? -1 : 1; + shuffle_layer->setReshapeDimensions(reshape_dims); + if (channel_index != 0) { - // maybe we do not need this check. concerned about TRT optimization shuffle_layer->setFirstTranspose(permutation); } - shuffle_layer->setReshapeDimensions(reshape_dims); tensor = shuffle_layer->getOutput(0); } + TRT_ShapedWeights weights = inputs.at(1).weights(); + if (params->converter->precision_mode() == FP16MODE) { + weights = ConvertFP32ToFP16(params->weight_store, weights); + } nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL; if (weights.shape_.d[0] == 1) { mode = nvinfer1::ScaleMode::kUNIFORM; } + TRT_ShapedWeights empty_weights(weights.type_); nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( - *const_cast(tensor), mode, weights.GetTrtWeights(), - empty_weights.GetTrtWeights(), empty_weights.GetTrtWeights()); + *tensor, mode, weights.GetTrtWeights(), empty_weights.GetTrtWeights(), + empty_weights.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); - // restore transpose & reshape - if (channel_index != 0 || dims.nbDims != 3) { + // Restore transpose & reshape. + if (channel_index != 0 || original_dims.nbDims != 3) { nvinfer1::IShuffleLayer* shuffle_layer = - params->converter->network()->addShuffle( - *const_cast(output_tensor)); + params->converter->network()->addShuffle(*output_tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); - nvinfer1::Dims reshape_dims = dims; - int tmp = reshape_dims.d[channel_index]; - reshape_dims.d[channel_index] = reshape_dims.d[0]; - reshape_dims.d[0] = tmp; + // NOTE: for same reason as mentioned above we need to apply the reshape + // unconditionally. 
+ nvinfer1::Dims reshape_dims = original_dims; + if (channel_index != 0) { + // NOTE: according to NVIDIA dimension types are deprecated, so we don't + // need to copy them back. + reshape_dims.d[channel_index] = original_dims.d[0]; + reshape_dims.d[0] = original_dims.d[channel_index]; + } shuffle_layer->setReshapeDimensions(reshape_dims); + if (channel_index != 0) { shuffle_layer->setSecondTranspose(permutation); } + params->converter->MarkQuantizationRangesAsInferrable( + output_tensor, shuffle_layer->getOutput(0)); output_tensor = shuffle_layer->getOutput(0); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return tensorflow::Status::OK(); + return Status::OK(); } Status GetTensorDimsWithProtoShape(const Tensor& tensor, @@ -2053,9 +2396,9 @@ tensorflow::Status ConvertConst(OpConverterParams* params) { uint8* data = reinterpret_cast(temp_weights.data()); std::copy(data, data + tensor.NumElements(), dst); } else { - return errors::FailedPrecondition( - "Unexpected data type: ", DataTypeString(dtype), - " at: ", node_def.name()); + return errors::FailedPrecondition("Unexpected data type: ", + DataTypeString(dtype), " at: ", + node_def.name()); } } } @@ -2070,32 +2413,41 @@ tensorflow::Status ConvertConst(OpConverterParams* params) { } tensorflow::Status ConvertIdentity(OpConverterParams* params) { + // TODO(tmorris): TRT's Identity layer does not get optimized away as of TRT + // 5.0, however once we know that it does it would be nice to use that + // instead. params->outputs->push_back(params->inputs.at(0)); return tensorflow::Status::OK(); } -tensorflow::Status ConvertBinary(OpConverterParams* params) { +Status ConvertBinary(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; if (inputs.size() != 2) { - return tensorflow::errors::FailedPrecondition( - "Binary ops require two tensor input, at ", node_def.name()); + return errors::InvalidArgument("Binary ops require two inputs, at ", + node_def.name()); } // Constant folding should have been done by TensorFlow - if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) { - return tensorflow::errors::Unimplemented( + return errors::Unimplemented( "Constant folding is falled back to TensorFlow, binary op received " "both input as constant at: ", node_def.name()); } - // Try to convert into Scale layer first (for better performance) + // TODO(tmorris): TRT plans to deprecate IScaleLayer and will replace it with + // IElementwiseLayer. At that point, we can remove BinaryTensorOpWeight. For + // now, the performance will be slightly better with IScaleLayer because it + // can be fused in more situations. However, most of the benefits of + // IScaleLayer are when the layer performs both a shift and a scale, which we + // don't do except for convolutions. + // + // Try to convert into Scale layer first (for better performance). // Since scale layer supports restricted broadcast policy and op types, we // allow failure and try to handle it through Elementwise op - // (BinaryTensorOpTensor) - Status status = tensorflow::Status::OK(); + // (BinaryTensorOpTensor). 
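
The rank-3 reshape used by ConvertBiasAdd above relies on the IShuffleLayer convention that a 0 copies the corresponding input dimension and a -1 is inferred from the remaining element count. A sketch of the resulting shape computation, with a hypothetical helper name:

```
#include <cstddef>
#include <vector>

// Apply a reshape spec where 0 copies the input dimension at that position
// and -1 is inferred from the remaining element count.
std::vector<int> ApplyReshapeSpec(const std::vector<int>& input,
                                  const std::vector<int>& spec) {
  long total = 1;
  for (int d : input) total *= d;
  std::vector<int> out(spec.size(), 1);
  long known = 1;
  int infer_index = -1;
  for (size_t i = 0; i < spec.size(); ++i) {
    if (spec[i] == 0) {
      out[i] = input[i];  // copy from input (valid only if i < input rank)
    } else if (spec[i] == -1) {
      infer_index = static_cast<int>(i);
      continue;  // fill in below
    } else {
      out[i] = spec[i];
    }
    known *= out[i];
  }
  if (infer_index >= 0) out[infer_index] = static_cast<int>(total / known);
  return out;
}
// Example: an input of {8, 4, 4, 2} with spec {0, 0, -1} reshapes to
// {8, 4, 8}, i.e. rank 3 with the trailing dimensions folded together.
```
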
+ Status status = Status::OK(); if (inputs.at(0).is_tensor() && inputs.at(1).is_weights()) { status = BinaryTensorOpWeight(params, inputs.at(0).tensor(), inputs.at(1).weights(), false); @@ -2103,7 +2455,10 @@ tensorflow::Status ConvertBinary(OpConverterParams* params) { status = BinaryTensorOpWeight(params, inputs.at(1).tensor(), inputs.at(0).weights(), true); } + // If both input are tensors, or one of them is weights but the conversion + // above failed, try the conversion using BinaryTensorOpTensor. if ((inputs.at(0).is_tensor() && inputs.at(1).is_tensor()) || !status.ok()) { + if (!status.ok()) VLOG(1) << status; status = BinaryTensorOpTensor(params, inputs.at(0), inputs.at(1)); } return status; @@ -2133,6 +2488,20 @@ tensorflow::Status ConvertUnary(OpConverterParams* params) { nvinfer1::IUnaryLayer* layer; if (node_def.op() == "Rsqrt") { + // We will need a quantization range for intermediate tensor if not using + // calibration. + // + // x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x) + // ^ + // need range here + if (params->converter->precision_mode() == INT8MODE && + !params->converter->use_calibration()) { + return errors::Unimplemented( + "Intermediate quantization range cannot be determined without" + " calibration for Rsqrt, consider replacing with " + "Sqrt -> FakeQuant -> Reciprocal ops, at ", + node_def.name()); + } layer = params->converter->network()->addUnary( *const_cast(tensor), nvinfer1::UnaryOperation::kSQRT); @@ -2156,6 +2525,48 @@ tensorflow::Status ConvertUnary(OpConverterParams* params) { return tensorflow::Status::OK(); } +tensorflow::Status ConvertSquare(OpConverterParams* params) { + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + if (inputs.size() != 1) { + return tensorflow::errors::InvalidArgument("Square expects one input, at ", + node_def.name()); + } + if (inputs.at(0).is_weights()) { + return tensorflow::errors::Unimplemented( + "Square is only implemented for tensors, at ", node_def.name()); + } + if (params->validation_only) return Status::OK(); + + // Constant 2 with same rank as input + nvinfer1::Dims dims = inputs.at(0).GetTrtDims(); + for (int i = 0; i < dims.nbDims; i++) { + dims.d[i] = 1; + } + TRT_ShapedWeights weights = params->weight_store->GetTempWeights( + tensorflow::DataType::DT_FLOAT, dims); + auto weights_ptr = + static_cast(const_cast(weights.GetValues())); + weights_ptr[0] = 2.f; + nvinfer1::IConstantLayer* const2_layer = + params->converter->network()->addConstant(dims, weights.GetTrtWeights()); + TFTRT_RETURN_ERROR_IF_NULLPTR(const2_layer, node_def.name()); + + // ElementWise Pow Operation + const nvinfer1::ITensor* tensor_l = inputs.at(0).tensor(); + const nvinfer1::ITensor* tensor_r = const2_layer->getOutput(0); + nvinfer1::IElementWiseLayer* layer = + params->converter->network()->addElementWise( + *const_cast(tensor_l), + *const_cast(tensor_r), + nvinfer1::ElementWiseOperation::kPOW); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + nvinfer1::ITensor* output_tensor = layer->getOutput(0); + + params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return tensorflow::Status::OK(); +} + tensorflow::Status ConvertReduce(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -2692,6 +3103,8 @@ tensorflow::Status ConvertSoftmax(OpConverterParams* params) { layer->setAxes(1 << (nbDims - 1)); nvinfer1::ITensor* output_tensor = layer->getOutput(0); + // Quantization range for SoftMax is always (0, 1) + 
params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 1.0f); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return tensorflow::Status::OK(); } @@ -2716,9 +3129,9 @@ tensorflow::Status ConvertTopK(OpConverterParams* params) { op = nvinfer1::TopKOperation::kMAX; reducedAxes |= 1 << (nbDims - 1); } else { - return tensorflow::errors::Unimplemented( - "Operation: " + node_def.op() + - " not implemented, at: " + node_def.name()); + return tensorflow::errors::Unimplemented("Operation: " + node_def.op() + + " not implemented, at: " + + node_def.name()); } nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK( @@ -2732,40 +3145,52 @@ tensorflow::Status ConvertTopK(OpConverterParams* params) { return tensorflow::Status::OK(); } -void TrtNodeValidator::RegisterOpValidators() { +static void RegisterValidatableOpConverters( + std::unordered_map* registration) { // TODO(laigd): support all op types. - op_validators_["Const"] = ConvertConst; - op_validators_["Transpose"] = ConvertTranspose; - op_validators_["Reshape"] = ConvertReshape; - op_validators_["MatMul"] = ConvertMatMul; + (*registration)["BiasAdd"] = ConvertBiasAdd; + (*registration)["Const"] = ConvertConst; + (*registration)["Transpose"] = ConvertTranspose; + (*registration)["Reshape"] = ConvertReshape; + (*registration)["MatMul"] = ConvertMatMul; + (*registration)["Relu6"] = ConvertRelu6; + (*registration)["Square"] = ConvertSquare; + + for (auto quantization_op_type : + {"QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3", + "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxArgs"}) { + (*registration)[quantization_op_type] = ConvertQuantize; + } + for (auto binary_op_type : + {"Add", "Mul", "Sub", "Div", "RealDiv", "Maximum", "Minimum"}) { + (*registration)[binary_op_type] = ConvertBinary; + } + for (auto activation_op_type : {"Relu", "Sigmoid", "Tanh"}) { + (*registration)[activation_op_type] = ConvertActivation; + } +} + +void TrtNodeValidator::RegisterOpValidators() { + RegisterValidatableOpConverters(&op_validators_); } void Converter::RegisterOpConverters() { - // vgg_16 slim implementation + RegisterValidatableOpConverters(&op_registry_); + op_registry_["Conv2D"] = ConvertConv2D; op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise; - op_registry_["Relu"] = ConvertActivation; op_registry_["MaxPool"] = ConvertPool; op_registry_["AvgPool"] = ConvertPool; - op_registry_["BiasAdd"] = ConvertScale; - op_registry_["Const"] = ConvertConst; // TODO(ben,jie): this is a temp hack. 
op_registry_["Identity"] = ConvertIdentity; // Identity should be removed op_registry_["Snapshot"] = ConvertIdentity; // Snapshot should be removed - // resnet_50_v1 slim implementation - op_registry_["Add"] = ConvertBinary; - op_registry_["Mul"] = ConvertBinary; - op_registry_["Sub"] = ConvertBinary; op_registry_["Pad"] = ConvertPad; op_registry_["ConcatV2"] = ConvertConcat; op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm; op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm; - op_registry_["Div"] = ConvertBinary; - op_registry_["RealDiv"] = ConvertBinary; - op_registry_["Rsqrt"] = ConvertUnary; op_registry_["Reciprocal"] = ConvertUnary; op_registry_["Exp"] = ConvertUnary; @@ -2774,20 +3199,19 @@ void Converter::RegisterOpConverters() { op_registry_["Abs"] = ConvertUnary; op_registry_["Neg"] = ConvertUnary; - op_registry_["Transpose"] = ConvertTranspose; - op_registry_["Reshape"] = ConvertReshape; - op_registry_["Sum"] = ConvertReduce; op_registry_["Prod"] = ConvertReduce; op_registry_["Max"] = ConvertReduce; op_registry_["Min"] = ConvertReduce; op_registry_["Mean"] = ConvertReduce; - op_registry_["Maximum"] = ConvertBinary; - op_registry_["Minimum"] = ConvertBinary; op_registry_["Softmax"] = ConvertSoftmax; - op_registry_["MatMul"] = ConvertMatMul; op_registry_["BatchMatMul"] = ConvertBatchMatMul; op_registry_["TopKV2"] = ConvertTopK; + op_registry_["Relu6"] = ConvertRelu6; + op_registry_["QuantizeAndDequantizeV2"] = ConvertQuantize; + op_registry_["QuantizeAndDequantizeV3"] = ConvertQuantize; + op_registry_["FakeQuantWithMinMaxVars"] = ConvertQuantize; + op_registry_["FakeQuantWithMinMaxArgs"] = ConvertQuantize; plugin_converter_ = ConvertPlugin; } @@ -2798,7 +3222,7 @@ tensorflow::Status ConvertGraphDefToEngine( const std::vector& input_shapes, Logger* logger, nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, - TrtUniquePtrType* engine, + TrtUniquePtrType* engine, bool use_calibration, bool* convert_successfully) { engine->reset(); if (convert_successfully) *convert_successfully = false; @@ -2813,7 +3237,11 @@ tensorflow::Status ConvertGraphDefToEngine( builder->setHalf2Mode(true); } else if (precision_mode == INT8MODE) { builder->setInt8Mode(true); - builder->setInt8Calibrator(calibrator); + if (use_calibration) { + builder->setInt8Calibrator(calibrator); + } else { + builder->setInt8Calibrator(nullptr); + } } // Create the network. @@ -2826,7 +3254,7 @@ tensorflow::Status ConvertGraphDefToEngine( // Build the network VLOG(1) << "Starting engine conversion "; - Converter converter(trt_network.get(), precision_mode == FP16MODE); + Converter converter(trt_network.get(), precision_mode, use_calibration); std::vector> output_tensors; // Graph nodes are already topologically sorted during construction for (const auto& node_def : gdef.node()) { @@ -2882,6 +3310,9 @@ tensorflow::Status ConvertGraphDefToEngine( TF_RETURN_IF_ERROR(converter.RenameAndMarkOutputTensors(output_tensors)); if (convert_successfully) *convert_successfully = true; + // Apply user provided quantization ranges to tensors + converter.MaybeApplyQuantizationRanges(); + // Build the engine. 
VLOG(1) << "Starting engine creation"; engine->reset(builder->buildCudaEngine(*converter.network())); @@ -3026,7 +3457,8 @@ tensorflow::Status ConvertSegmentToGraphDef( } } *common_scope = local_scope; - VLOG(0) << "Segment @scope '" << local_scope << "', converted to graph"; + VLOG(1) << "Converted TensorRT candidate segment @scope '" << local_scope + << "' to a GraphDef"; return tensorflow::Status::OK(); } diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h index 5cc28b33e7f..f1c4c121ae6 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h @@ -92,7 +92,8 @@ struct EngineInfo { EngineInfo() : engine_type(EngineType::TRTStatic), max_workspace_size_bytes(0), - precision_mode(FP32MODE) {} + precision_mode(FP32MODE), + use_calibration(true) {} string engine_name; string device; @@ -109,6 +110,7 @@ struct EngineInfo { int maximum_cached_engines; std::vector cached_engine_batches; int precision_mode; + bool use_calibration; }; // Constructs a graphdef from the segment in the given graph. Adds placeholder @@ -145,7 +147,7 @@ tensorflow::Status ConvertGraphDefToEngine( const std::vector& input_shapes, Logger* logger, nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, - TrtUniquePtrType* engine, + TrtUniquePtrType* engine, bool use_calibration, bool* convert_successfully); // Helper class for the segmenter to determine whether an output edge from the @@ -392,7 +394,8 @@ class TrtNodeValidator { // Class to convert TF nodes to TRT network. class Converter { public: - Converter(nvinfer1::INetworkDefinition* trt_network, bool is_fp16); + Converter(nvinfer1::INetworkDefinition* trt_network, int precision_mode, + bool use_calibration); ////////////////////////////////////////////////////////////////////////////// // Methods used by the TRT engine builder to build a TRT network from a TF @@ -422,8 +425,43 @@ class Converter { // to add TRT layers. nvinfer1::INetworkDefinition* network() { return trt_network_; } - // Is the converter operating in fp16 mode? - bool is_fp16() const { return is_fp16_; } + // What precision are we targeting? + int precision_mode() const { return precision_mode_; } + + // Calibration will be or was previously performed on this network? + bool use_calibration() const { return use_calibration_; } + + // This should be called on the inputs and outputs of any layer we create + // where we know that the quantization range does not change during that + // operation. (e.g. Reshape, Transpose, Identity, MaxPool). + void MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input, + nvinfer1::ITensor* output); + + // This function should be called when we know the quantization range of a + // tensor, either from a quantize/dequantize node or when the output is a + // fixed range (e.g. SoftMax, Relu6, Sigmoid). + void ProvideQuantizationRange(nvinfer1::ITensor* tensor, float min_range, + float max_range); + + // Should be called when full TRT network has been constructed and before + // building the engine. + void MaybeApplyQuantizationRanges(); + + // This should be called on the inputs and outputs of any layer we create + // where we know that the quantization range does not change during that + // operation. (e.g. Reshape, Transpose, Identity, MaxPool). 
+ void MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input, + nvinfer1::ITensor* output); + + // This function should be called when we know the quantization range of a + // tensor, either from a quantize/dequantize node or when the output is a + // fixed range (e.g. SoftMax, Relu6, Sigmoid). + void ProvideQuantizationRange(nvinfer1::ITensor* tensor, + float min_range, float max_range); + + // Should be called when full TRT network has been constructed and before + // building the engine. + void ApplyQuantizationRanges(bool warn_missing_ranges); // Below are helper methods for op converters to add different layers to the // TRT network. @@ -440,6 +478,13 @@ class Converter { const nvinfer1::Dims& dims, const nvinfer1::ITensor** tensor); + // Return OK if the broadcast scheme is supported and compute the shapes after + // broadcasting. + Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, + const TRT_TensorOrWeights& operand_r, + nvinfer1::Dims* operand_l_new_dims, + nvinfer1::Dims* operand_r_new_dims) const; + private: // Verify the provided batch_size is consistent with batch_size_ and update it // if necessary. @@ -457,6 +502,12 @@ class Converter { void RegisterOpConverters(); + void PropagateQuantizationRanges(); + + // Gets the min and max value in a TRT_ShapedWeights + Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, + float* out_max) const; + // Registered op converters by op type. std::unordered_map op_registry_; @@ -472,7 +523,25 @@ class Converter { // Store the weights added during construction of trt_network_. TrtWeightStore weight_store_; - const bool is_fp16_; + // During conversion, this table is populated with quantization ranges per + // tensor. MaybeApplyQuantizationRanges() will use this table to set the TRT + // quantization ranges. Since TRT only supports symmetric ranges, we will + // store the range as a single float = max(abs(min_range), abs(max_range)). + // Range refers to the floating point values, e.g. min_range = 0.0f, max_range + // = 6.0f for Relu6. + std::unordered_map quantization_ranges_; + + // Edges where quantization ranges can be inferred (copied) across ops - from + // first tensor to second tensor. PropagateQuantizationRanges() will propagate + // known ranges from quantization_ranges_ across these edges, adding the new + // ranges to quantization_ranges_ so that they can be applied in + // MaybeApplyQuantizationRanges(). + std::vector> + quantization_infer_; + + const int precision_mode_; + + const bool use_calibration_; // Batch size of inputs to trt_network_ added by AddInputTensor(). During // network construction it will update this, use it to verify the batch diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc index c3a39395f3a..a95ab8dfbbb 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc @@ -35,7 +35,10 @@ limitations under the License. 
#include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/config.pb.h" // NOLINT +#include "tensorflow/core/public/session.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT @@ -47,7 +50,9 @@ namespace tensorflow { namespace tensorrt { namespace convert { +using ::tensorflow::strings::StrCat; using ::testing::ElementsAre; +using ::testing::ElementsAreArray; // TODO(laigd): put this into some test utils file. void ExpectStatus(Status status, error::Code code = error::OK, @@ -69,6 +74,32 @@ nvinfer1::Dims GetTestDims(const std::vector& d) { return dims; } +nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) { + switch (tf_dtype) { + case DT_FLOAT: + return nvinfer1::DataType::kFLOAT; + case DT_HALF: + return nvinfer1::DataType::kHALF; + case DT_INT32: + return nvinfer1::DataType::kINT32; + default: + QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype); + } +} + +DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) { + switch (trt_dtype) { + case nvinfer1::DataType::kFLOAT: + return DT_FLOAT; + case nvinfer1::DataType::kHALF: + return DT_HALF; + case nvinfer1::DataType::kINT32: + return DT_INT32; + default: + QCHECK(false) << "Unexpected data type " << static_cast(trt_dtype); + } +} + NodeDef MakeNodeDef(const string& name, const string& op, const std::vector& inputs) { NodeDef node_def; @@ -111,6 +142,35 @@ bool TrtDimsEqualsArray(const std::vector& lhs, return TrtDimsEquals(GetTestDims(lhs), rhs); } +// TODO(laigd): define a parameterized matcher that can compare against the +// vector. +void ExpectTrtDimsEqualsArray(const std::vector& lhs, + const nvinfer1::Dims& rhs) { + EXPECT_TRUE(TrtDimsEqualsArray(lhs, rhs)) + << "expected: " << DebugString(GetTestDims(lhs)) << "\n" + << " actual: " << DebugString(rhs); +} + +template +void ExpectArrayNear(const std::vector& lhs, const std::vector& rhs) { + ASSERT_EQ(lhs.size(), rhs.size()); + for (int i = 0; i < lhs.size(); i++) { + EXPECT_FLOAT_EQ(lhs[i], rhs[i]); + } +} + +// Eigen::half cannot implicitly convert to float which is required for +// EXPECT_FLOAT_EQ. +template <> +void ExpectArrayNear(const std::vector& lhs, + const std::vector& rhs) { + ASSERT_EQ(lhs.size(), rhs.size()); + for (int i = 0; i < lhs.size(); i++) { + EXPECT_FLOAT_EQ(Eigen::half_impl::half_to_float(lhs[i]), + Eigen::half_impl::half_to_float(rhs[i])); + } +} + bool TrtShapedWeightsEquals(const TRT_ShapedWeights& lhs, const TRT_ShapedWeights& rhs) { return TrtDimsEquals(lhs.shape_, rhs.shape_) && lhs.type_ == rhs.type_ && @@ -121,8 +181,7 @@ template void ValidateWeights(const TRT_ShapedWeights& weights, const std::vector& expected_dims, const std::vector& expected_value) { - EXPECT_TRUE(TrtDimsEqualsArray(expected_dims, weights.shape_)) - << weights.DebugString(); + ExpectTrtDimsEqualsArray(expected_dims, weights.shape_); ASSERT_EQ(expected_value.size(), weights.count()) << weights.DebugString(); const T* actual_values = static_cast(weights.GetValues()); for (int i = 0; i < expected_value.size(); ++i) { @@ -133,11 +192,12 @@ void ValidateWeights(const TRT_ShapedWeights& weights, // Fake ITensor implementation for testing purposes. 
class FakeITensor : public nvinfer1::ITensor { public: - FakeITensor() {} + FakeITensor() : dynamic_range_(0.0f) {} - FakeITensor(const nvinfer1::Dims& dims) : dims_(dims) {} + FakeITensor(const nvinfer1::Dims& dims) : dims_(dims), dynamic_range_(0.0f) {} - FakeITensor(const std::vector& dims) : dims_(GetTestDims(dims)) {} + FakeITensor(const std::vector& dims) + : dims_(GetTestDims(dims)), dynamic_range_(0.0f) {} void setName(const char* name) override { name_ = name; } @@ -166,7 +226,12 @@ class FakeITensor : public nvinfer1::ITensor { } #if NV_TENSORRT_MAJOR >= 5 - bool setDynamicRange(float min, float max) override {} + bool setDynamicRange(float min, float max) override { + dynamic_range_ = std::max(std::abs(min), std::abs(max)); + return true; + } + + float getDynamicRange() const override { return dynamic_range_; } #endif private: @@ -174,6 +239,7 @@ class FakeITensor : public nvinfer1::ITensor { nvinfer1::Dims dims_; nvinfer1::DataType type_; nvinfer1::TensorLocation location_; + float dynamic_range_; }; TEST(TRT_ShapedWeights_Test, Basic) { @@ -265,9 +331,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) { EXPECT_EQ(1, ptr->batch_size()); } EXPECT_EQ(&itensor, ptr->tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({1}, ptr->GetTrtDims())) - << "- expected: " << DebugString(dims) - << "\n vs\n- actual: " << DebugString(ptr->GetTrtDims()); + ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims()); } } } @@ -286,9 +350,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) { EXPECT_EQ(false, ptr->is_weights()); EXPECT_EQ(1, ptr->batch_size()); EXPECT_NE(nullptr, ptr->tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({1}, ptr->GetTrtDims())) - << "- expected: " << DebugString(dims) - << "\n vs\n- actual: " << DebugString(ptr->GetTrtDims()); + ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims()); } } // Test constructor with TRT_ShapedWeights argument. @@ -305,9 +367,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) { nvinfer1::Dims dims; dims.nbDims = 0; - EXPECT_TRUE(TrtDimsEqualsArray({}, ptr->GetTrtDims())) - << "- expected: " << DebugString(dims) - << "\n vs\n- actual: " << DebugString(ptr->GetTrtDims()); + ExpectTrtDimsEqualsArray({}, ptr->GetTrtDims()); } } } @@ -341,34 +401,50 @@ TEST_F(ValidatorTest, ConvertToTensorOrWeights) { graph_properties, &output)); ValidateWeights(output.weights(), {2}, {1.0, 2.0}); } - // Convert non-Const. We test the case where the non-batch dimemsion is - // unknown as well, to make sure the validator allows that. - for (const int32 non_batch_dim : {-1, 2}) { - const int32 batch_size = 12; + // Helper method to run ConvertToTensorOrWeights() with predefined parameters. + auto convert_to_tensor_or_weights = [this](const std::vector& dims, + TRT_TensorOrWeights* output) { Scope s = Scope::NewRootScope(); - ops::Placeholder::Attrs attrs; - TF_EXPECT_OK(TensorShapeUtils::MakeShape( - std::vector{batch_size, non_batch_dim}, &attrs.shape_)); + const auto attrs = ops::Placeholder::Shape(PartialTensorShape{dims}); auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT, attrs); auto add = ops::Add(s.WithOpName("add"), feed, feed); grappler::GrapplerItem item; TF_EXPECT_OK(s.ToGraphDef(&item.graph)); - grappler::GraphProperties graph_properties(item); TF_EXPECT_OK(graph_properties.InferStatically(true)); - - auto& node_def = add.operation.node()->def(); + const NodeDef& node_def = add.operation.node()->def(); + return this->ConvertToTensorOrWeights(node_def, /*output_port=*/0, + graph_properties, output); + }; + // Convert non-Const with #dims > nvinfer1::Dims::MAX_DIMS+1. 
+ { TRT_TensorOrWeights output; - ExpectStatus(ConvertToTensorOrWeights(node_def, /*output_port=*/0, - graph_properties, &output)); + ExpectStatus( + convert_to_tensor_or_weights( + std::vector(nvinfer1::Dims::MAX_DIMS + 2, 1), &output), + error::OUT_OF_RANGE, "Input tensor rank is greater than 9"); + } + // Convert non-Const with #dims < 2. + { + TRT_TensorOrWeights output; + ExpectStatus( + convert_to_tensor_or_weights({1}, &output), error::INVALID_ARGUMENT, + "Input tensor with rank<2 is not supported since the first dimension " + "is treated as batch dimension by TRT"); + } + // Convert non-Const. We test the case where the non-batch dimemsion is + // unknown as well, to make sure the validator allows that. + for (const int32 non_batch_dim : {-1, 2}) { + const int32 batch_size = 12; + TRT_TensorOrWeights output; + ExpectStatus( + convert_to_tensor_or_weights({batch_size, non_batch_dim}, &output)); EXPECT_EQ(true, output.is_tensor()); EXPECT_EQ(batch_size, output.batch_size()); EXPECT_NE(nullptr, output.tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims())) - << "- expected: {" << non_batch_dim << "} \n vs\n" - << "- actual: " << DebugString(output.GetTrtDims()); + ExpectTrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims()); } } @@ -405,7 +481,9 @@ class ConverterTest : public ::testing::Test { ConverterTest() { builder_.reset(nvinfer1::createInferBuilder(logger_)); network_.reset(builder_->createNetwork()); - converter_.reset(new Converter(network_.get(), /*fp16=*/false)); + converter_.reset(new Converter(network_.get(), + /*precision_mode=*/FP32MODE, + /*use_calibration=*/false)); weight_store_ = &converter_->weight_store_; } @@ -432,8 +510,21 @@ class ConverterTest : public ::testing::Test { return converter_->GetInputs(node_def, inputs); } + Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, + float* out_max) const { + return converter_->GetWeightRange(weights, out_min, out_max); + } + + void PropagateQuantizationRanges() { + converter_->PropagateQuantizationRanges(); + } + int batch_size() const { return converter_->batch_size_; } + std::unordered_map& quantization_ranges() { + return converter_->quantization_ranges_; + } + private: Logger logger_; // These members are ordered in a way such that the destruction order is: @@ -504,9 +595,9 @@ TEST_F(ConverterTest, AddAndGetInputs) { EXPECT_EQ(nvinfer1::DataType::kFLOAT, inputs[0].tensor()->getType()); EXPECT_EQ(nvinfer1::DataType::kINT32, inputs[2].tensor()->getType()); EXPECT_EQ(nvinfer1::DataType::kHALF, inputs[3].tensor()->getType()); - EXPECT_TRUE(TrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions())); - EXPECT_TRUE(TrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions())); - EXPECT_TRUE(TrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions())); + ExpectTrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions()); + ExpectTrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions()); + ExpectTrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions()); } TEST_F(ConverterTest, RenameAndMarkOutputTensors) { @@ -552,7 +643,7 @@ TEST_F(ConverterTest, RenameAndMarkOutputTensors) { {{"my_op", "my_output"}, {"my_op:1", "my_output_1"}})); EXPECT_EQ(2, output_tensors.size()); for (auto output_tensor : output_tensors) { - EXPECT_TRUE(TrtDimsEqualsArray({2, 1}, output_tensor->getDimensions())); + ExpectTrtDimsEqualsArray({2, 1}, output_tensor->getDimensions()); } EXPECT_EQ("my_output", string(output_tensors[0]->getName())); EXPECT_EQ("my_output_1", 
string(output_tensors[1]->getName())); @@ -577,8 +668,7 @@ TEST_F(ConverterTest, TransposeTensor) { // OK. TF_EXPECT_OK( converter_->TransposeTensor(input_tensor, {0, 3, 1, 2}, &output_tensor)); - EXPECT_TRUE(TrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions())) - << DebugString(*output_tensor); + ExpectTrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions()); } TEST_F(ConverterTest, PrepareTensorForShape_Tensor) { @@ -590,7 +680,7 @@ TEST_F(ConverterTest, PrepareTensorForShape_Tensor) { // Shape size doesn't match. ExpectStatus(converter_->PrepareTensorForShape(tw, GetTestDims({2, 3, 6}), &output_tensor), - error::INVALID_ARGUMENT, "Reshape shapes are not compatible."); + error::INVALID_ARGUMENT, "Reshape shapes are not compatible"); // TODO(aaroey): we should check the case where uninferred dimensions are not // an exact divisor of input dim ensions, e.g. for dims {-1, 7}. @@ -598,14 +688,12 @@ TEST_F(ConverterTest, PrepareTensorForShape_Tensor) { // Infer shape, ok. TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({-1, 2}), &output_tensor)); - EXPECT_TRUE(TrtDimsEqualsArray({15, 2}, output_tensor->getDimensions())) - << DebugString(*output_tensor); + ExpectTrtDimsEqualsArray({15, 2}, output_tensor->getDimensions()); // Regular shape. TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}), &output_tensor)); - EXPECT_TRUE(TrtDimsEqualsArray({10, 3}, output_tensor->getDimensions())) - << DebugString(*output_tensor); + ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions()); } TEST_F(ConverterTest, PrepareTensorForShape_Weights) { @@ -615,8 +703,7 @@ TEST_F(ConverterTest, PrepareTensorForShape_Weights) { const nvinfer1::ITensor* output_tensor = nullptr; TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}), &output_tensor)); - EXPECT_TRUE(TrtDimsEqualsArray({10, 3}, output_tensor->getDimensions())) - << DebugString(*output_tensor); + ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions()); } TEST_F(ConverterTest, MaybeUpdateBatchSize) { @@ -656,6 +743,178 @@ TEST_F(ConverterTest, AddAndGetTensorOrWeights) { "tensor/weights my_tensor already exist"); } +template +void TestGetWeightRange(ConverterTest* test, TrtWeightStore* weight_store) { + TRT_ShapedWeights weights = + weight_store->GetTempWeights(DataTypeToEnum::v(), GetTestDims({2, 3})); + const std::vector values = {T(3), T(1), T(2), T(6), T(5), T(4)}; + memcpy(const_cast(weights.GetValues()), values.data(), + weights.size_bytes()); + + float out_min = 0.0f; + float out_max = 0.0f; + TF_EXPECT_OK(test->GetWeightRange(weights, &out_min, &out_max)); + EXPECT_EQ(1.0f, out_min); + EXPECT_EQ(6.0f, out_max); +} + +TEST_F(ConverterTest, GetWeightRange) { + TestGetWeightRange(this, weight_store_); + TestGetWeightRange(this, weight_store_); + TestGetWeightRange(this, weight_store_); +} + +TEST_F(ConverterTest, ProvideQuantizationRange) { + FakeITensor fake_tensor; + // Assymetric range + converter_->ProvideQuantizationRange(&fake_tensor, 0.0f, 6.0f); + EXPECT_EQ(6.0f, quantization_ranges()[&fake_tensor]); + converter_->ProvideQuantizationRange(&fake_tensor, 1.0f, 6.0f); + EXPECT_EQ(6.0f, quantization_ranges()[&fake_tensor]); + converter_->ProvideQuantizationRange(&fake_tensor, -8.0f, 6.0f); + EXPECT_EQ(8.0f, quantization_ranges()[&fake_tensor]); + converter_->ProvideQuantizationRange(&fake_tensor, -8.123f, -6.123f); + EXPECT_EQ(8.123f, quantization_ranges()[&fake_tensor]); + // Symmetric range + converter_->ProvideQuantizationRange(&fake_tensor, -6.123f, 6.123f); + 
EXPECT_EQ(6.123f, quantization_ranges()[&fake_tensor]); +} + +TEST_F(ConverterTest, MaybeApplyQuantizationRanges) { + // input -> infer1 -> infer2 -> infer3 + FakeITensor input, infer_1, infer_2, infer_3; + FakeITensor not_infer; + Converter int8_converter(/*trt_network=*/nullptr, INT8MODE, + /*use_calibration=*/true); + int8_converter.ProvideQuantizationRange(&input, -5.0f, 5.0f); + int8_converter.ProvideQuantizationRange(¬_infer, -100.0f, 100.0f); + int8_converter.MarkQuantizationRangesAsInferrable(&input, &infer_1); + int8_converter.MarkQuantizationRangesAsInferrable(&infer_1, &infer_2); + int8_converter.MarkQuantizationRangesAsInferrable(&infer_2, &infer_3); + + // Input range should be inferred along the chain and applied to tensors. + int8_converter.MaybeApplyQuantizationRanges(); +#if NV_TENSORRT_MAJOR >= 5 + EXPECT_EQ(input.getDynamicRange(), 5.0f); + EXPECT_EQ(infer_1.getDynamicRange(), 5.0f); + EXPECT_EQ(infer_2.getDynamicRange(), 5.0f); + EXPECT_EQ(infer_3.getDynamicRange(), 5.0f); + EXPECT_EQ(not_infer.getDynamicRange(), 100.0f); +#endif +} + +TEST_F(ConverterTest, PropagateQuantizationRanges) { + // infer0 <-> infer1 <-> infer2 <-> infer3 + // | + // infer4 <-> infer5 + FakeITensor infer[6]; + FakeITensor not_infer; + converter_->ProvideQuantizationRange(&infer[4], -5.0f, 5.0f); + converter_->MarkQuantizationRangesAsInferrable(&infer[0], &infer[1]); + converter_->MarkQuantizationRangesAsInferrable(&infer[1], &infer[2]); + converter_->MarkQuantizationRangesAsInferrable(&infer[3], &infer[2]); + converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[1]); + converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[5]); + + // Input range should be inferred along the chain. + PropagateQuantizationRanges(); + auto ranges = quantization_ranges(); + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(5.0f, ranges[&infer[i]]); + } + EXPECT_EQ(ranges.count(¬_infer), 0); +} + +TEST_F(ConverterTest, GetTrtBroadcastShape) { + const bool kIsTensor = true; + const bool kIsNotTensor = false; + auto symmetric_test = [this](const std::vector& operand_1_shape, + const std::vector& operand_2_shape, + const bool operand_1_is_tensor, + const bool operand_2_is_tensor, + const std::vector& expected_operand_1_shape, + const std::vector& expected_operand_2_shape, + error::Code expected_code = error::OK, + const char* expected_error_msg_substr = nullptr, + const int operand_1_batch_size = -1, + const int operand_2_batch_size = -1) { + auto create_tensor_or_weights = [](const std::vector& shape, + bool is_tensor, int batch_size = -1) { + if (is_tensor) { + return TRT_TensorOrWeights{nvinfer1::DataType::kFLOAT, + GetTestDims(shape), batch_size}; + } + TRT_ShapedWeights weights; + weights.shape_ = GetTestDims(shape); + return TRT_TensorOrWeights(weights); + }; + + nvinfer1::Dims operand_1_new_dims, operand_2_new_dims; + TRT_TensorOrWeights operand_1 = create_tensor_or_weights( + operand_1_shape, operand_1_is_tensor, operand_1_batch_size); + TRT_TensorOrWeights operand_2 = create_tensor_or_weights( + operand_2_shape, operand_2_is_tensor, operand_2_batch_size); + + // operand_1 broadcast operand_2 + ExpectStatus( + this->converter_->GetTrtBroadcastShape( + operand_1, operand_2, &operand_1_new_dims, &operand_2_new_dims), + expected_code, expected_error_msg_substr); + if (expected_code == error::OK) { + ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims); + ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims); + } + // operand_2 broadcast operand_1 + ExpectStatus( + 
this->converter_->GetTrtBroadcastShape( + operand_2, operand_1, &operand_2_new_dims, &operand_1_new_dims), + expected_code, expected_error_msg_substr); + if (expected_code == error::OK) { + ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims); + ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims); + } + }; + + // Both inputs are weights. + symmetric_test( + {1}, {1}, kIsNotTensor, kIsNotTensor, {}, {}, error::INVALID_ARGUMENT, + "Broadcasting requires at least one of the operands be tensors"); + + // One tensor and one weights. + symmetric_test({1, 1, 1}, {2}, kIsTensor, kIsNotTensor, {1, 1, 1}, {1, 1, 2}); + symmetric_test({1, 1, 2}, {2}, kIsTensor, kIsNotTensor, {1, 1, 2}, {1, 1, 2}); + symmetric_test({1, 3, 2}, {1}, kIsTensor, kIsNotTensor, {1, 3, 2}, {1, 1, 1}); + symmetric_test({1, 1, 1}, {2, 3}, kIsTensor, kIsNotTensor, {1, 1, 1}, + {1, 2, 3}); + symmetric_test({1, 1, 1}, {2, 3, 4}, kIsTensor, kIsNotTensor, {1, 1, 1}, + {2, 3, 4}); + symmetric_test({1, 1, 1}, {1, 2, 3, 4}, kIsTensor, kIsNotTensor, {1, 1, 1}, + {2, 3, 4}); + symmetric_test({1, 3, 4}, {1, 2, 1, 4}, kIsTensor, kIsNotTensor, {1, 3, 4}, + {2, 1, 4}); + symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, + error::INVALID_ARGUMENT, "Infeasible broadcast scheme"); + symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, + error::INVALID_ARGUMENT, "Infeasible broadcast scheme", + /*operand_1_batch_size=*/2); + symmetric_test({1, 1, 1}, {1, 1, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, + error::INVALID_ARGUMENT, + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims 4 vs broadcast #dims 5)"); + + // Both inputs are tensors. + symmetric_test({1, 1, 1}, {1, 1}, kIsTensor, kIsTensor, {}, {}, + error::INVALID_ARGUMENT, + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims 3 vs broadcast #dims 4)"); + symmetric_test({1, 3, 4}, {2, 1, 4}, kIsTensor, kIsTensor, {1, 3, 4}, + {2, 1, 4}); + symmetric_test({1, 1, 1}, {1, 1, 1, 1}, kIsTensor, kIsTensor, {}, {}, + error::INVALID_ARGUMENT, + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims 4 vs broadcast #dims 5)"); +} + // Class to test various op converters, using both a TrtNodeValidator and // Converter. class OpConverterTest : public ::testing::Test { @@ -684,15 +943,21 @@ class OpConverterTest : public ::testing::Test { // Reset the validator and converter. validator_.reset(new TrtNodeValidator); - converter_.reset(new Converter(network_.get(), /*fp16=*/false)); + converter_.reset(new Converter(network_.get(), + /*precision_mode=*/FP32MODE, + /*use_calibration=*/false)); // Reset other related artifacts. scope_ = Scope::NewRootScope(); validator_inputs_.clear(); } - void BuildAndRun(const char* input_name, const std::vector& input_data, - const char* output_name, std::vector* output_data) { + // TODO(laigd): test fp16 and int8 support. + template + void BuildAndRun( + const std::vector>>& + input_data, + const char* output_name, std::vector* output_data) { // Mark the output tensor as TRT engine output. TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors( {{string(output_name), string(output_name)}})); @@ -703,25 +968,33 @@ class OpConverterTest : public ::testing::Test { CHECK_NOTNULL(engine_.get()); // Execute the TRT engine. 
- const int input_size = input_data.size() * sizeof(float); - const int output_size = output_data->size() * sizeof(float); - const int input_index = engine_->getBindingIndex(input_name); - const int output_index = engine_->getBindingIndex(output_name); + ASSERT_LE(input_data.size() + 1, 3); + void* buffers[3]; + for (const auto name_and_data : input_data) { + const int input_size = name_and_data.second.size() * sizeof(T); + const int input_index = engine_->getBindingIndex(name_and_data.first); + ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size)); + ASSERT_EQ( + 0, cudaMemcpyAsync(buffers[input_index], name_and_data.second.data(), + input_size, cudaMemcpyHostToDevice, stream_)); + } - ASSERT_EQ(engine_->getNbBindings(), 2); - void* buffers[2]; - ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size)); + const int output_size = output_data->size() * sizeof(T); + const int output_index = engine_->getBindingIndex(output_name); ASSERT_EQ(0, cudaMalloc(&buffers[output_index], output_size)); - ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input_data.data(), - input_size, cudaMemcpyHostToDevice, stream_)); + + ASSERT_EQ(engine_->getNbBindings(), input_data.size() + 1); + TrtUniquePtrType execution_context( engine_->createExecutionContext()); execution_context->enqueue(/*batchSize=*/1, buffers, stream_, nullptr); ASSERT_EQ(0, cudaMemcpyAsync(output_data->data(), buffers[output_index], output_size, cudaMemcpyDeviceToHost, stream_)); cudaStreamSynchronize(stream_); - ASSERT_EQ(0, cudaFree(buffers[input_index])); - ASSERT_EQ(0, cudaFree(buffers[output_index])); + + for (int i = 0; i < input_data.size() + 1; ++i) { + ASSERT_EQ(0, cudaFree(buffers[i])); + } } bool HasStaticShape(const nvinfer1::Dims& dims) const { @@ -736,18 +1009,7 @@ class OpConverterTest : public ::testing::Test { void AddTestTensor( const char* name, const std::vector& dims, int batch_size = 1, nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) { - DataType tf_dtype = DT_FLOAT; - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - tf_dtype = DT_FLOAT; - break; - case nvinfer1::DataType::kINT32: - tf_dtype = DT_INT32; - break; - default: - ASSERT_TRUE(false) << "Unexpected data type " - << static_cast(trt_dtype); - } + DataType tf_dtype = TrtDataTypeToTf(trt_dtype); ops::Placeholder::Attrs attrs; TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &attrs.shape_)); attrs.shape_.InsertDim(0, batch_size); @@ -826,6 +1088,11 @@ class OpConverterTest : public ::testing::Test { } } + // Expose quantization_ranges_ for tests + std::unordered_map& quantization_ranges() { + return converter_->quantization_ranges_; + } + std::unique_ptr converter_; std::unique_ptr validator_; @@ -835,6 +1102,11 @@ class OpConverterTest : public ::testing::Test { TrtUniquePtrType network_; TrtUniquePtrType engine_; cudaStream_t stream_; + // Used to create placeholders with shape and data type information. The + // created placeholders will be used as inputs to the node to be verified, + // thus we need the shape and data type information to get a non-empty + // GraphProperties. + // TODO(laigd): consider use this Scope to create the NodeDef to verify. 
Scope scope_; std::unordered_map validator_inputs_; }; @@ -958,15 +1230,15 @@ TEST_F(OpConverterTest, ConvertTranspose) { Reset(); AddTestTensor("input", {1, 2, 3}); AddTestWeights("weights", {4}, {0, 3, 1, 2}); - RunConversion(node_def); + RunValidationAndConversion(node_def); TRT_TensorOrWeights output; TF_EXPECT_OK(GetTensorOrWeights("my_transpose", &output)); EXPECT_TRUE(output.is_tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions())) - << output.DebugString(); + ExpectTrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions()); std::vector output_data(6); - BuildAndRun("input", {1, 2, 3, 4, 5, 6}, "my_transpose", &output_data); + BuildAndRun({{"input", {1, 2, 3, 4, 5, 6}}}, "my_transpose", + &output_data); EXPECT_THAT(output_data, ElementsAre(1, 4, 2, 5, 3, 6)); } } @@ -1048,15 +1320,15 @@ TEST_F(OpConverterTest, ConvertReshape) { Reset(); AddTestTensor("input", ok_params[i].tensor_dims, ok_params[i].batch_size); AddTestWeights("weights", {4}, ok_params[i].shape); - RunConversion(node_def); + RunValidationAndConversion(node_def); TRT_TensorOrWeights output; TF_EXPECT_OK(GetTensorOrWeights("my_reshape", &output)); EXPECT_TRUE(output.is_tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions())) - << output.DebugString(); + ExpectTrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions()); std::vector output_data(6); - BuildAndRun("input", {1, 2, 3, 4, 5, 6}, "my_reshape", &output_data); + BuildAndRun({{"input", {1, 2, 3, 4, 5, 6}}}, "my_reshape", + &output_data); EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6)); } } @@ -1070,15 +1342,14 @@ TEST_F(OpConverterTest, ConvertMatMul) { "Input expects tensor and weights, at my_matmul"); } - // Get the NodeDef for Reshape. + // Get the NodeDef for MatMul. auto get_matmul_nodedef = [](DataType dtype, bool transpose_a, bool transpose_b) -> NodeDef { Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), dtype); auto weights = ops::Placeholder(s.WithOpName("weights"), dtype); - ops::MatMul::Attrs matmul_attrs; - matmul_attrs.transpose_a_ = transpose_a; - matmul_attrs.transpose_b_ = transpose_b; + const auto matmul_attrs = + ops::MatMul::TransposeA(transpose_a).TransposeB(transpose_b); auto matmul = ops::MatMul(s.WithOpName("my_matmul"), input, weights, matmul_attrs); return matmul.operation.node()->def(); @@ -1094,45 +1365,845 @@ TEST_F(OpConverterTest, ConvertMatMul) { node_def, error::UNIMPLEMENTED, "Data type is not supported, for node my_matmul got int32"); } - { - // transpose_a is set. - for (bool transpose_b : {false, true}) { - Reset(); - NodeDef node_def = - get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/true, transpose_b); - AddTestTensor("input", {2}, /*batch_size=*/1); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "transpose_a is not supported for TensorRT FullyConnected"); + // transpose_a is set. + for (bool transpose_b : {false, true}) { + Reset(); + NodeDef node_def = + get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/true, transpose_b); + AddTestTensor("input", {2}, /*batch_size=*/1); + AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "transpose_a is not supported for TensorRT FullyConnected"); + } + // OK. 
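The expectations in the loop that follows fall out of multiplying the 1x2 input [0, 1] by the 2x2 row-major weight [[0, 1], [2, 3]], optionally transposed. A quick standalone check of that arithmetic, independent of the test harness:

```
#include <cassert>

int main() {
  const float input[2] = {0.0f, 1.0f};
  const float w[2][2] = {{0.0f, 1.0f}, {2.0f, 3.0f}};  // row-major, as in the test weights

  float out_plain[2] = {0.0f, 0.0f};       // transpose_b = false: input * w
  float out_transposed[2] = {0.0f, 0.0f};  // transpose_b = true:  input * w^T
  for (int j = 0; j < 2; ++j) {
    for (int k = 0; k < 2; ++k) {
      out_plain[j] += input[k] * w[k][j];
      out_transposed[j] += input[k] * w[j][k];
    }
  }
  assert(out_plain[0] == 2.0f && out_plain[1] == 3.0f);            // ElementsAre(2, 3)
  assert(out_transposed[0] == 1.0f && out_transposed[1] == 3.0f);  // ElementsAre(1, 3)
  return 0;
}
```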
+ for (bool transpose_b : {false, true}) { + Reset(); + NodeDef node_def = + get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/false, transpose_b); + AddTestTensor("input", {2}, /*batch_size=*/1); + AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); + + std::vector output_data(2); + BuildAndRun({{"input", {0, 1}}}, "my_matmul", &output_data); + if (transpose_b) { + EXPECT_THAT(output_data, ElementsAre(1, 3)); + } else { + EXPECT_THAT(output_data, ElementsAre(2, 3)); } } - { - // OK. - for (bool transpose_b : {false, true}) { - Reset(); - NodeDef node_def = - get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/false, transpose_b); - AddTestTensor("input", {2}, /*batch_size=*/1); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - RunConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output)); - EXPECT_TRUE(output.is_tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({2}, output.tensor()->getDimensions())) - << output.DebugString(); +} - std::vector output_data(2); - BuildAndRun("input", {0, 1}, "my_matmul", &output_data); - if (transpose_b) { - EXPECT_THAT(output_data, ElementsAre(1, 3)); +template +void TestConvertBiasAdd(OpConverterTest* test) { + // Get the NodeDef for BiasAdd. + auto get_biasadd_nodedef = [](const string& data_format) -> NodeDef { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), dtype); + auto weights = ops::Placeholder(s.WithOpName("weights"), dtype); + const auto biasadd_attrs = ops::BiasAdd::DataFormat(data_format); + auto biasadd = + ops::BiasAdd(s.WithOpName("my_biasadd"), input, weights, biasadd_attrs); + return biasadd.operation.node()->def(); + }; + + typedef typename EnumToDataType::Type CType; + for (const string& data_format : {"NHWC", "NCHW"}) { + for (const int trt_input_rank : {1, 2, 3, 4}) { + test->Reset(); + NodeDef node_def = get_biasadd_nodedef(data_format); + + // Add input, dims_array will be like {2, 1, ..., 1, 3} + std::vector dims_array(trt_input_rank, 1); + if (trt_input_rank == 1) { + dims_array[0] = (data_format == "NHWC" ? 3 : 2); } else { - EXPECT_THAT(output_data, ElementsAre(2, 3)); + dims_array[0] = 2; + dims_array[trt_input_rank - 1] = 3; + } + test->AddTestTensor("input", dims_array, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + + // Add bias weights. + const int channel_size = (data_format == "NHWC" ? 3 : 2); + std::vector bias(channel_size); + for (int i = 0; i < channel_size; ++i) { + bias[i] = CType(i + 1); // bias will be {1, 2, 3, ...} + } + test->AddTestWeights("weights", {channel_size}, bias); + + // Run the conversion. + test->RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_biasadd", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray(dims_array, output.tensor()->getDimensions()); + + // Build and run the engine. + const int num_input = TrtDimsNumElements(GetTestDims(dims_array)); + ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 
3 : 2), + num_input); + std::vector output_data(num_input); + test->BuildAndRun( + {{"input", std::vector(num_input, CType(0))}}, "my_biasadd", + &output_data); + if (trt_input_rank == 1) { + if (data_format == "NHWC") { + EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3))); + } else { + EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2))); + } + } else { + if (data_format == "NHWC") { + EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3), + CType(1), CType(2), CType(3))); + } else { + EXPECT_THAT(output_data, ElementsAre(CType(1), CType(1), CType(1), + CType(2), CType(2), CType(2))); + } } } } } +TEST_F(OpConverterTest, ConvertQuantize) { + { + // Input list is empty, should fail. + NodeDef node_def = + MakeNodeDef("my_quantize", "QuantizeAndDequantizeV2", {}); + RunConversion( + node_def, error::INVALID_ARGUMENT, + "Invalid number of inputs for QuantizeAndDequantizeV2, at my_quantize"); + } + { + // FakeQuantWithMinMaxArgs attributes are empty, should fail. + NodeDef node_def = + MakeNodeDef("my_quantize", "FakeQuantWithMinMaxArgs", {"input"}); + AddTestTensor("input", {1, 2, 3}); + RunConversion(node_def, error::INVALID_ARGUMENT, + "Min or max attribute not found for FakeQuantWithMinMaxArgs " + "at my_quantize"); + } + { + // FakeQuantWithMinMaxArgs ranges set via attributes, ok. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + ops::FakeQuantWithMinMaxArgs::Attrs quantize_attrs; + quantize_attrs.min_ = -6.0f; + quantize_attrs.max_ = 6.0f; + auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("my_quantize"), + input, quantize_attrs); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + RunConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(ranges.count(output.tensor()), 1); + EXPECT_EQ(ranges[output.tensor()], 6.0f); + } + { + // FakeQuantWithMinMaxVars ranges set via inputs, ok. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto quantize = ops::FakeQuantWithMinMaxVars( + s.WithOpName("my_quantize"), input, weights_min, weights_max); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights_min", {1}, {-6.0f}); + AddTestWeights("weights_max", {1}, {6.0f}); + RunConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(ranges.count(output.tensor()), 1); + EXPECT_EQ(ranges[output.tensor()], 6.0f); + } + { + // QuantizeAndDequantizeV2 ranges set via inputs, ok. 
+ Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto quantize = ops::QuantizeAndDequantizeV2( + s.WithOpName("my_quantize"), input, weights_min, weights_max); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights_min", {1}, {-6.0f}); + AddTestWeights("weights_max", {1}, {6.0f}); + RunConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(ranges.count(output.tensor()), 1); + EXPECT_EQ(ranges[output.tensor()], 6.0f); + } + { + // QuantizeAndDequantizeV3 ranges set via inputs, ok. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto num_bits = ops::Placeholder(s.WithOpName("num_bits"), DT_INT32); + auto quantize = ops::QuantizeAndDequantizeV3( + s.WithOpName("my_quantize"), input, weights_min, weights_max, num_bits); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights_min", {1}, {-6.0f}); + AddTestWeights("weights_max", {1}, {6.0f}); + AddTestWeights("num_bits", {1}, {8}); + RunConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(ranges.count(output.tensor()), 1); + EXPECT_EQ(ranges[output.tensor()], 6.0f); + } + { + // QuantizeAndDequantizeV2 Range inputs are tensors, should fail. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto quantize = ops::QuantizeAndDequantizeV2( + s.WithOpName("my_quantize"), input, weights_min, weights_max); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestTensor("weights_min", {1}); + AddTestTensor("weights_max", {1}); + RunConversion( + node_def, error::INVALID_ARGUMENT, + "Min and max inputs for QuantizeAndDequantizeV2 must be weights not " + "tensors, at my_quantize"); + } +} + +TEST_F(OpConverterTest, ConvertRelu6) { + { + // Input list is empty, should fail. + NodeDef node_def = MakeNodeDef("my_relu6", "Relu6", {}); + RunConversion(node_def, error::INVALID_ARGUMENT, + "Invalid number of inputs for Relu6, at my_relu6"); + } + + // Get the NodeDef for Relu6. + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto relu6 = ops::Relu6(s.WithOpName("my_relu6"), input); + const NodeDef& node_def = relu6.operation.node()->def(); + + { + // Clip tensor values and set quantization ranges, ok. 
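The values checked below follow from the definition relu6(x) = min(max(x, 0), 6) applied to the test input {-100, -1, 0, 3, 5, 9}, which is also why the converter can report a fixed quantization range of 6 for the output. A standalone restatement of that arithmetic:

```
#include <algorithm>
#include <cassert>
#include <vector>

float Relu6(float x) { return std::min(std::max(x, 0.0f), 6.0f); }

int main() {
  const std::vector<float> input = {-100, -1, 0, 3, 5, 9};
  const std::vector<float> expected = {0, 0, 0, 3, 5, 6};  // ElementsAre(0, 0, 0, 3, 5, 6)
  for (size_t i = 0; i < input.size(); ++i) {
    assert(Relu6(input[i]) == expected[i]);
  }
  return 0;
}
```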
+ Reset(); + AddTestTensor("input", {1, 2, 3}); + RunConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_relu6", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(ranges[output.tensor()], 6.0f); + + std::vector output_data(6); + BuildAndRun("input", {-100, -1, 0, 3, 5, 9}, "my_relu6", &output_data); + EXPECT_THAT(output_data, ElementsAre(0, 0, 0, 3, 5, 6)); + } + { + // Input is weights, should fail. + Reset(); + AddTestWeights("input", {1, 2, 3}, {-100, -1, 0, 3, 5, 9}); + RunConversion( + node_def, error::UNIMPLEMENTED, + "Relu6 is only implemented for tensors, not weights, at my_relu6"); + } +} + +TEST_F(OpConverterTest, ConvertBiasAdd) { + { + // Input list is empty, should fail. + NodeDef node_def = MakeNodeDef("my_biasadd", "BiasAdd", {}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Input expects tensor and weights, at my_biasadd"); + } + + // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test + // DT_INT32 type here. + TestConvertBiasAdd(this); + TestConvertBiasAdd(this); +} + +template +NodeDef GetBinaryOpNodeDef(const string& input_name_l, + const string& input_name_r, DataType dtype) { + Scope s = Scope::NewRootScope(); + auto input_l = ops::Placeholder(s.WithOpName(input_name_l), dtype); + auto input_r = ops::Placeholder(s.WithOpName(input_name_r), dtype); + auto op = OpType(s.WithOpName("my_binary"), input_l, input_r); + return op.operation.node()->def(); +} + +void CheckAddedLayers(OpConverterTest* test, bool expect_scale_layer) { + bool element_wise_layer_found = false; + bool scale_layer_found = false; + for (int i = 0; i < test->converter_->network()->getNbLayers(); i++) { + nvinfer1::ILayer* layer = test->converter_->network()->getLayer(i); + if (dynamic_cast(layer)) { + scale_layer_found = true; + } else if (dynamic_cast(layer)) { + element_wise_layer_found = true; + } + } + EXPECT_EQ(expect_scale_layer, scale_layer_found); + EXPECT_NE(expect_scale_layer, element_wise_layer_found); +} + +template +void TestBinaryTensorOpWeightNoBroadcast(OpConverterTest* test) { + typedef typename EnumToDataType::Type CType; + for (auto swap_inputs : {false, true}) { + test->Reset(); + NodeDef node_def; + if (swap_inputs) { + node_def = GetBinaryOpNodeDef("weights", "input", dtype); + } else { + node_def = GetBinaryOpNodeDef("input", "weights", dtype); + } + + const std::vector operand1{CType(3), CType(7.5)}; + const std::vector operand2{CType(2), CType(3)}; + + // It requires the dims to be at least of rank 3 to apply an IScaleLayer. + test->AddTestTensor("input", /*dims=*/{1, 1, 2}, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + test->AddTestWeights("weights", /*dims=*/{1, 1, 2}, + /*values=*/swap_inputs ? operand1 : operand2); + test->RunValidationAndConversion(node_def); + + // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor. + CheckAddedLayers(test, /*expect_scale_layer=*/true); + + // Check the dims of the output ITensor. + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({1, 1, 2}, output.tensor()->getDimensions()); + + std::vector output_data(2); + test->BuildAndRun( + {{"input", + /*input_data=*/swap_inputs ? 
operand2 : operand1}}, + "my_binary", &output_data); + if (node_def.op() == "Add") { + EXPECT_THAT(output_data, ElementsAre(CType(5), CType(10.5))); + } else if (node_def.op() == "Sub") { + EXPECT_THAT(output_data, ElementsAre(CType(1), CType(4.5))); + } else if (node_def.op() == "Mul") { + EXPECT_THAT(output_data, ElementsAre(CType(6), CType(22.5))); + } else if (node_def.op() == "Div") { + EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5))); + } else if (node_def.op() == "RealDiv") { + EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5))); + } else { + ASSERT_TRUE(false); + } + } +} + +template +void TestBinaryTensorOpWeightWithChannelWiseBroadcast(OpConverterTest* test) { + typedef typename EnumToDataType::Type CType; + const NodeDef node_def = + GetBinaryOpNodeDef("input", "weights", dtype); + const std::vector input{CType(1), CType(2), CType(3), CType(4)}; + const std::vector weights{CType(10), CType(20)}; + // There are two types of valid dim pairs which requires channel-wise + // broadcasting: + // - input dims (X Y Z) vs weights dims (X 1 1) + // - input dims (X Y Z) vs weights dims (Z) + // Here X=Z=2 and Y=1. + for (auto weights_dims : std::vector>{{2, 1, 1}, {2}}) { + test->Reset(); + test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + test->AddTestWeights("weights", weights_dims, weights); + test->RunValidationAndConversion(node_def); + + // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor. + CheckAddedLayers(test, /*expect_scale_layer=*/true); + + // Check the dims of the output ITensor. + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions()); + + std::vector output_data(4); + test->BuildAndRun({{"input", input}}, "my_binary", &output_data); + if (weights_dims.size() == 1) { + EXPECT_THAT(output_data, + ElementsAre(CType(11), CType(22), CType(13), CType(24))); + } else { + EXPECT_THAT(output_data, + ElementsAre(CType(11), CType(12), CType(23), CType(24))); + } + } +} + +template +void TestBinaryTensorOpWeightWithUniformlyBroadcast(OpConverterTest* test) { + typedef typename EnumToDataType::Type CType; + const NodeDef node_def = + GetBinaryOpNodeDef("input", "weights", dtype); + const std::vector input{CType(1), CType(2), CType(3), CType(4)}; + const std::vector weights{CType(10)}; + test->Reset(); + test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + test->AddTestWeights("weights", {1, 1, 1, 1}, weights); + test->RunValidationAndConversion(node_def); + + // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor. + CheckAddedLayers(test, /*expect_scale_layer=*/true); + + // Check the dims of the output ITensor. 
+ TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions()); + + std::vector output_data(4); + test->BuildAndRun({{"input", input}}, "my_binary", &output_data); + EXPECT_THAT(output_data, + ElementsAre(CType(11), CType(12), CType(13), CType(14))); +} + +template +void TestBinaryTensorOpWeightFallback(OpConverterTest* test, + const std::vector& input_dims, + const std::vector& weights_dims, + error::Code code = error::OK, + const char* error_msg_substr = nullptr, + const int input_batch_size = 1) { + const DataType dtype = DT_FLOAT; + typedef typename EnumToDataType::Type CType; + const size_t num_inputs = TrtDimsNumElements(GetTestDims(input_dims)); + const size_t num_weights = TrtDimsNumElements(GetTestDims(weights_dims)); + + test->Reset(); + const NodeDef node_def = + GetBinaryOpNodeDef("input", "weights", dtype); + test->AddTestTensor("input", /*dims=*/input_dims, input_batch_size, + TfDataTypeToTrt(dtype)); + test->AddTestWeights( + "weights", /*dims=*/weights_dims, + /*values=*/std::vector(num_weights, CType(1))); + test->RunValidationAndConversion(node_def, code, error_msg_substr); + if (code != error::OK) return; + + // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight. + CheckAddedLayers(test, /*expect_scale_layer=*/false); + + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); + EXPECT_TRUE(output.is_tensor()); + + // Check the dims of the output ITensor. + std::vector expected_output_dims = input_dims; + for (int i = expected_output_dims.size() - 1, j = weights_dims.size() - 1; + i >= 0 && j >= 0; --i, --j) { + if (expected_output_dims[i] == 1) { + expected_output_dims[i] = weights_dims[j]; + } + } + ExpectTrtDimsEqualsArray(expected_output_dims, + output.tensor()->getDimensions()); + + // Check the result of running the engine. + const int expected_num_outputs = + TrtDimsNumElements(GetTestDims(expected_output_dims)); + std::vector output_data(expected_num_outputs); + test->BuildAndRun( + {{"input", + /*input_data=*/std::vector(num_inputs, CType(2))}}, + "my_binary", &output_data); + if (node_def.op() == "Add") { + EXPECT_THAT(output_data, ElementsAreArray(std::vector( + expected_num_outputs, CType(3)))); + } else if (node_def.op() == "Minimum") { + EXPECT_THAT(output_data, ElementsAreArray(std::vector( + expected_num_outputs, CType(1)))); + } else { + ASSERT_TRUE(false); + } +} + +template +void TestBinaryTensorOpTensor(OpConverterTest* test) { + typedef typename EnumToDataType::Type CType; + test->Reset(); + const NodeDef node_def = + GetBinaryOpNodeDef("input1", "input2", dtype); + test->AddTestTensor("input1", /*dims=*/{1, 2}, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + test->AddTestTensor("input2", /*dims=*/{2, 1}, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + test->RunValidationAndConversion(node_def); + + // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight. + CheckAddedLayers(test, /*expect_scale_layer=*/false); + + // Check output dims. + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); + + std::vector output_data(4); + // After broadcasting first input becomes {3, 6, 3, 6} and second input + // becomes {2, 3, 2, 3}. 
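Spelled out for the Add case below: broadcasting the {1, 2} operand {3, 6} against the {2, 1} operand {2, 3} gives a {2, 2} result in which the first operand repeats along rows and the second along columns (so it reads {2, 2, 3, 3} in row-major order), which is what the ElementsAre(5, 8, 6, 9) expectation encodes. A standalone check of that arithmetic:

```
#include <cassert>

int main() {
  const float in1[1][2] = {{3, 6}};    // shape {1, 2}
  const float in2[2][1] = {{2}, {3}};  // shape {2, 1}
  float sum[2][2];
  for (int r = 0; r < 2; ++r) {
    for (int c = 0; c < 2; ++c) {
      // Broadcasting: in1 repeats along rows, in2 repeats along columns.
      sum[r][c] = in1[0][c] + in2[r][0];
    }
  }
  // Row-major result: {5, 8, 6, 9}, matching the Add expectation below.
  assert(sum[0][0] == 5 && sum[0][1] == 8 && sum[1][0] == 6 && sum[1][1] == 9);
  return 0;
}
```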
+ test->BuildAndRun( + {{"input1", {CType(3), CType(6)}}, {"input2", {CType(2), CType(3)}}}, + "my_binary", &output_data); + if (node_def.op() == "Add") { + EXPECT_THAT(output_data, + ElementsAre(CType(5), CType(8), CType(6), CType(9))); + } else if (node_def.op() == "Sub") { + EXPECT_THAT(output_data, + ElementsAre(CType(1), CType(4), CType(0), CType(3))); + } else if (node_def.op() == "Mul") { + EXPECT_THAT(output_data, + ElementsAre(CType(6), CType(12), CType(9), CType(18))); + } else if (node_def.op() == "Div") { + EXPECT_THAT(output_data, + ElementsAre(CType(1.5), CType(3), CType(1), CType(2))); + } else if (node_def.op() == "RealDiv") { + EXPECT_THAT(output_data, + ElementsAre(CType(1.5), CType(3), CType(1), CType(2))); + } else if (node_def.op() == "Minimum") { + EXPECT_THAT(output_data, + ElementsAre(CType(2), CType(2), CType(3), CType(3))); + } else if (node_def.op() == "Maximum") { + EXPECT_THAT(output_data, + ElementsAre(CType(3), CType(6), CType(3), CType(6))); + } else { + ASSERT_TRUE(false); + } +} + +TEST_F(OpConverterTest, ConvertBinary) { + // Input size doesn't match, should fail. + for (size_t num_inputs = 0; num_inputs < 2; ++num_inputs) { + Reset(); + NodeDef node_def = MakeNodeDef("my_add", "Add", {num_inputs, "input"}); + AddTestTensor("input", {1}, /*batch_size=*/1, nvinfer1::DataType::kFLOAT); + RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, + "Binary ops require two inputs, at my_add"); + } + { + // Both inputs are weights. + Reset(); + NodeDef node_def = MakeNodeDef("my_add", "Add", {"weights1", "weights2"}); + AddTestWeights("weights1", {1}, {1}); + AddTestWeights("weights2", {1}, {1}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "Constant folding is falled back to TensorFlow, binary op received " + "both input as constant at: my_add"); + } + + // Test BinaryTensorOpWeight() without broadcasting. + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); +#if 0 + // TODO(b/119560144): it doesn't support FP16 constants and the following test + // will fail. + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); +#endif + + // Test BinaryTensorOpWeight() with channel-wise broadcasting. + TestBinaryTensorOpWeightWithChannelWiseBroadcast(this); + + // Test BinaryTensorOpWeight() with uniformly broadcasting. + TestBinaryTensorOpWeightWithUniformlyBroadcast(this); + + // Test BinaryTensorOpWeight() falling back to BinaryTensorOpTensor(). + // Unsupported op. + TestBinaryTensorOpWeightFallback(this, {1, 1, 1}, {1}); + // Rank of input tensor dimension <3. + TestBinaryTensorOpWeightFallback(this, {1, 1}, {1}); + // Broadcast on batch dimension, should fail. + TestBinaryTensorOpWeightFallback( + this, {1, 1, 1}, {2, 1, 1, 1}, error::INVALID_ARGUMENT, + "Unsupported binary op broadcast scheme for op my_binary", + /*input_batch_size=*/2); + // Incompatible dims with per-channel mode. + TestBinaryTensorOpWeightFallback(this, {1, 1, 1}, {1, 2, 1}); + // Incompatible dims. + TestBinaryTensorOpWeightFallback(this, {1, 2, 1}, {2}); + + // Test BinaryTensorOpTensor() with broadcasting. 
+ TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); +} + +TEST_F(OpConverterTest, ConvertQuantize) { + for (const string& op : + {"FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars", + "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3"}) { + // Input list is empty, should fail. + NodeDef node_def = MakeNodeDef("my_quantize", op, {}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + StrCat("Invalid number of inputs for ", op, ", at my_quantize") + .c_str()); + } + { + // FakeQuantWithMinMaxArgs attributes are empty, should fail. + NodeDef node_def = + MakeNodeDef("my_quantize", "FakeQuantWithMinMaxArgs", {"input"}); + AddTestTensor("input", {1, 2, 3}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Min or max attribute not found for FakeQuantWithMinMaxArgs " + "at my_quantize"); + } + { + // FakeQuantWithMinMaxArgs ranges set via attributes, ok. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f); + auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("my_quantize"), + input, quantize_attrs); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(1, ranges.count(output.tensor())); + EXPECT_EQ(6.0f, ranges[output.tensor()]); + } + { + // FakeQuantWithMinMaxVars ranges set via inputs, ok. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto quantize = ops::FakeQuantWithMinMaxVars( + s.WithOpName("my_quantize"), input, weights_min, weights_max); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights_min", {1}, {-6.0f}); + AddTestWeights("weights_max", {1}, {6.0f}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(1, ranges.count(output.tensor())); + EXPECT_EQ(6.0f, ranges[output.tensor()]); + } + { + // QuantizeAndDequantizeV2 ranges set via inputs, ok. 
+ Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto quantize = ops::QuantizeAndDequantizeV2( + s.WithOpName("my_quantize"), input, weights_min, weights_max); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights_min", {1}, {-6.0f}); + AddTestWeights("weights_max", {1}, {6.0f}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(1, ranges.count(output.tensor())); + EXPECT_EQ(6.0f, ranges[output.tensor()]); + } + { + // QuantizeAndDequantizeV2 Range inputs are tensors, should fail. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto quantize = ops::QuantizeAndDequantizeV2( + s.WithOpName("my_quantize"), input, weights_min, weights_max); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestTensor("weights_min", {1}); + AddTestTensor("weights_max", {1}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Min and max inputs for QuantizeAndDequantizeV2 must be weights not " + "tensors, at my_quantize"); + } + { + // QuantizeAndDequantizeV3 ranges set via inputs, ok. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto num_bits = ops::Placeholder(s.WithOpName("num_bits"), DT_INT32); + auto quantize = ops::QuantizeAndDequantizeV3( + s.WithOpName("my_quantize"), input, weights_min, weights_max, num_bits); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights_min", {1}, {-6.0f}); + AddTestWeights("weights_max", {1}, {6.0f}); + AddTestWeights("num_bits", {1}, {8}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(1, ranges.count(output.tensor())); + EXPECT_EQ(6.0f, ranges[output.tensor()]); + } +} + +TEST_F(OpConverterTest, ConvertRelu6) { + { + // Input list is empty, should fail. + NodeDef node_def = MakeNodeDef("my_relu6", "Relu6", {}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Invalid number of inputs for Relu6, at my_relu6"); + } + + // Get the NodeDef for Relu6. + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto relu6 = ops::Relu6(s.WithOpName("my_relu6"), input); + const NodeDef node_def = relu6.operation.node()->def(); + { + // Input is weights, should fail. + Reset(); + AddTestWeights("input", {1}, {1.0f}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "Relu6 is only implemented for tensors, not weights, at my_relu6"); + } + { + // Clip tensor values and set quantization ranges, ok. 
+ Reset(); + AddTestTensor("input", {1, 2, 3}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_relu6", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(ranges[output.tensor()], 6.0f); + + std::vector output_data(6); + BuildAndRun({{"input", {-100, -1, 0, 3, 5, 9}}}, "my_relu6", + &output_data); + EXPECT_THAT(output_data, ElementsAre(0, 0, 0, 3, 5, 6)); + } +} + +template +void TestConvertSquare(OpConverterTest* test) { + test->Reset(); + typedef typename EnumToDataType::Type CType; + + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), dtype); + auto square = ops::Square(s.WithOpName("my_square"), input); + NodeDef node_def = square.operation.node()->def(); + + test->AddTestTensor("input", {1, 20}); + test->RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_square", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({1, 20}, output.tensor()->getDimensions()); + + const int num_inputs = 20; + std::vector input_data(num_inputs); + std::vector expected_output_data(num_inputs); + for (int i = 0; i < 20; i++) { + const CType value = CType(i - 9); + input_data[i] = value; + expected_output_data[i] = value * value; + } + std::vector output_data(num_inputs); + test->BuildAndRun({{"input", input_data}}, "my_square", &output_data); + ExpectArrayNear(expected_output_data, output_data); +} + +TEST_F(OpConverterTest, ConvertSquare) { + { + // Input list is empty, should fail. + NodeDef node_def = MakeNodeDef("my_square", "Square", {}); + RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, + "Square expects one input, at my_square"); + } + { + // Input is weights, should fail. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto square = ops::Square(s.WithOpName("my_square"), input); + NodeDef node_def = square.operation.node()->def(); + AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "Square is only implemented for tensors, at my_square"); + } + + // OK. Note that kINT32 is not supported by IElementWiseLayer, so we don't + // test DT_INT32 type here. + TestConvertSquare(this); + // TODO(tmorris): Looks like there may be a bug with this layer for FP16 + // inputs. Disabling for now. + // TestConvertSquare(this); +} + } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc index b30d94b0282..4ac7e21d348 100644 --- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc @@ -67,6 +67,9 @@ tensorflow::Status TRTOptimizationPass::Init( TF_RETURN_IF_ERROR(GetPrecisionMode( Uppercase(params.at("precision_mode").s()), &precision_mode_)); } + if (params.count("use_calibration")) { + use_calibration_ = params.at("use_calibration").b(); + } return tensorflow::Status::OK(); } @@ -222,6 +225,12 @@ tensorflow::Status TRTOptimizationPass::Optimize( TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); tensorflow::tensorrt::convert::ConversionParams cp; + if (use_calibration_ && precision_mode_ != INT8MODE) { + LOG(ERROR) << "Calibration with FP32 or FP16 is not implemented. 
" + << "Falling back to use_calibration = False."; + use_calibration_ = false; + } + std::vector nodes_to_preserve; for (const auto& n : item.NodesToPreserve()) { auto tokens = str_util::Split(n, ":"); @@ -250,6 +259,7 @@ tensorflow::Status TRTOptimizationPass::Optimize( cp.is_dyn_op = is_dynamic_op_; cp.cached_engine_batches = batches_; cp.max_cached_engines = max_cached_batches_; + cp.use_calibration = use_calibration_; auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(cp); VLOG(1) << "Returning from " << name_; return status; diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h index 71b51d13681..3e8dc0978e4 100644 --- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h @@ -38,7 +38,8 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer { maximum_batch_size_(-1), is_dynamic_op_(false), max_cached_batches_(1), - max_workspace_size_bytes_(256LL << 20) { + max_workspace_size_bytes_(256LL << 20), + use_calibration_(true) { VLOG(1) << "Constructing " << name_; } @@ -67,6 +68,7 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer { std::vector batches_; int max_cached_batches_; int64_t max_workspace_size_bytes_; + bool use_calibration_; }; } // namespace convert diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index 019446813a5..117039683c0 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -124,8 +124,10 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("segment_funcdef_name", &funcdef_name_)); OP_REQUIRES_OK(context, GetPrecisionMode(precision_string, &precision_mode_)); - calibration_mode_ = - (precision_mode_ == INT8MODE && calibration_data.size() == 0); + OP_REQUIRES_OK(context, + context->GetAttr("use_calibration", &use_calibration_)); + calibration_mode_ = (use_calibration_ && precision_mode_ == INT8MODE && + calibration_data.size() == 0); if (calibration_data.size()) { calibrator_.reset(new TRTInt8Calibrator(calibration_data)); calibration_data.resize(0); @@ -252,9 +254,8 @@ int TRTEngineOp::GetEngineBatch(OpKernelContext* ctx) { cached_engine_batches_.push_back(num_batch); VLOG(1) << "Running with batch size " << num_batch; } else { - string msg = - StrCat("Engine buffer is full. buffer limit=", max_cached_engines_, - ", current entries="); + string msg = StrCat("Engine buffer is full. 
buffer limit=", + max_cached_engines_, ", current entries="); for (auto i : cached_engine_batches_) StrAppend(&msg, i, ","); StrAppend(&msg, " requested batch=", num_batch); LOG(WARNING) << msg; @@ -308,7 +309,7 @@ bool TRTEngineOp::ExecuteTrtEngine( std::vector buffers(num_binding); for (int i = 0; i < ctx->num_inputs(); i++) { const string input_name = StrCat(kInputPHName, i); - const size_t binding_index = + const int binding_index = trt_engine_ptr->getBindingIndex(input_name.c_str()); if (binding_index == -1) { LOG(ERROR) << "Input node not found, at " << input_name; @@ -345,7 +346,7 @@ bool TRTEngineOp::ExecuteTrtEngine( for (int i = 0; i < ctx->num_outputs(); i++) { // Create an output tensor const string output_name = StrCat(kOutputPHName, i); - const size_t binding_index = + const int binding_index = trt_engine_ptr->getBindingIndex(output_name.c_str()); Tensor* output_tensor = nullptr; @@ -491,13 +492,14 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size, } TrtUniquePtrType engine; bool convert_successfully = false; - VLOG(0) << name() << " Constructing a new engine with batch size " - << batch_size; + LOG(INFO) << "Building a new TensorRT engine for " << name() + << " with batch size " << batch_size; // Up to this point, calibrator_ can never be empty, since otherwise it // means calibration_mode_ is true and this path won't get executed. auto status = convert::ConvertGraphDefToEngine( segment_graph_, precision_mode_, batch_size, workspace_size_, shapes, - &logger, allocator, calibrator_.get(), &engine, &convert_successfully); + &logger, allocator, calibrator_.get(), &engine, use_calibration_, + &convert_successfully); if (!status.ok()) { if (convert_successfully) { // This means it fail to build the engine even when the network is built @@ -567,8 +569,8 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources( const int64 workspace_size_bytes = workspace_size_; cres->thr_.reset(new std::thread([cres, label, segment_graph, shapes, platform_gpu_id, workspace_size_bytes]() { - VLOG(0) << "Starting calibration thread on device " << platform_gpu_id - << ", Calibration Resource @ " << cres; + LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id + << ", Calibration Resource @ " << cres; auto err = cudaSetDevice(platform_gpu_id); if (err != cudaSuccess) { // TODO(aaroey): should return error here. @@ -586,6 +588,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources( *segment_graph, INT8MODE, cres->calibrator_->getBatchSize(), workspace_size_bytes, shapes, &cres->logger_, cres->allocator_.get(), cres->calibrator_.get(), &cres->engine_, + /*use_calibration=*/true, /*convert_successfully=*/nullptr); if (!s.ok()) { LOG(ERROR) << "Calibration failed: " << s; diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h index 8fe06758914..b545f497f32 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h @@ -130,6 +130,10 @@ class TRTEngineOp : public AsyncOpKernel { // The finalized calibrator for inference. std::unique_ptr calibrator_; + + // If true, create calibration graph for INT8 mode. Otherwise, we are using + // user-provided quantization ranges. 
+ bool use_calibration_; }; } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc index e0c7b627237..92405906eb7 100644 --- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc @@ -16,6 +16,7 @@ limitations under the License. #if GOOGLE_CUDA #if GOOGLE_TENSORRT +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/shape_inference.h" @@ -39,18 +40,19 @@ REGISTER_OP("TRTEngineOp") .Attr("cached_engine_batches: list(int) = []") .Attr("max_cached_engines_count: int = 1") .Attr("workspace_size_bytes: int") - .Attr("precision_mode: {'FP32', 'FP16', 'INT8', 'INT8CALIB'}") + .Attr("precision_mode: {'FP32', 'FP16', 'INT8'}") .Attr("calibration_data: string = ''") + .Attr("use_calibration: bool = true") .Input("in_tensor: InT") - .Output("out_tensor: OutT"); -// TODO(jie): TF requires concrete output shape for concrete input shapes. -// This is tricky for batch dimension, since we cannot ensure which input -// would carry the correct batch dimension (for the current stage of the -// implementation, we do require all input tensor to carry the same batch -// size, but this could change in the future). Hence we disable shape -// inference function as a workaround. -// .SetShapeFn(shape_inference::TRTEngineOpShapeInference); - + .Output("out_tensor: OutT") + // TODO(jie): TF requires concrete output shape for concrete input shapes. + // This is tricky for batch dimension, since we cannot ensure which input + // would carry the correct batch dimension (for the current stage of the + // implementation, we do require all input tensor to carry the same batch + // size, but this could change in the future). Hence we disable shape + // inference function as a workaround. + // .SetShapeFn(shape_inference::TRTEngineOpShapeInference); + .SetShapeFn(shape_inference::UnknownShape); } // namespace tensorflow #endif // GOOGLE_TENSORRT diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py index bb81fbf93f3..74a2c2392ad 100644 --- a/tensorflow/contrib/tensorrt/python/trt_convert.py +++ b/tensorflow/contrib/tensorrt/python/trt_convert.py @@ -63,19 +63,20 @@ class TrtPrecisionMode(object): return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8] -def tensorrt_rewriter_config(rewriter_config=None, - max_batch_size=1, - max_workspace_size_bytes=2 << 20, - precision_mode=TrtPrecisionMode.FP32, - minimum_segment_size=3, - is_dynamic_op=False, - maximum_cached_engines=1, - cached_engine_batch_sizes=None): +def get_tensorrt_rewriter_config(rewriter_config=None, + max_batch_size=1, + max_workspace_size_bytes=2 << 20, + precision_mode=TrtPrecisionMode.FP32, + minimum_segment_size=3, + is_dynamic_op=False, + maximum_cached_engines=1, + cached_engine_batch_sizes=None, + use_calibration=True): """Returns a RewriterConfig proto for TRT transformation. Args: - rewriter_config: a RewriterConfig proto to append the TensorRTOptimizer to. - If None, it will create one with default settings. + rewriter_config: a template RewriterConfig proto used to create a + TRT-enabled RewriterConfig. If None, it will use a default one. max_batch_size: max size for the input batch max_workspace_size_bytes: the maximum GPU temporary memory which the TRT engine can use at execution time. 
This corresponds to the 'workspaceSize' @@ -95,6 +96,15 @@ def tensorrt_rewriter_config(rewriter_config=None, use this list to determine the batch sizes of the cached engines, instead of making the decision on the fly. This is useful when we know the most common batch size(s) the application is going to generate. + use_calibration: this argument is ignored if precision_mode is not INT8. If + set to True, a calibration graph will be created to calibrate the missing + ranges. The calibration graph must be converted to an inference graph + using calib_graph_to_infer_graph() after running calibration. If set to + False, quantization nodes will be expected for every tensor in the graph + (excluding those which will be fused). If a range is missing, an error + will occur. Please note that accuracy may be negatively affected if there + is a mismatch between which tensors TRT quantizes and which tensors were + trained with fake quantization. Returns: A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler. @@ -107,13 +117,16 @@ def tensorrt_rewriter_config(rewriter_config=None, rewriter_config, rewriter_config_pb2.RewriterConfig): raise TypeError("rewriter_config should be a RewriterConfig proto.") + rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig() if rewriter_config is None: - rewriter_config = rewriter_config_pb2.RewriterConfig() # Layout optimizer may add Const nodes followed by Reshape nodes, thus we # need to run constant folding again. - rewriter_config.optimizers.extend(["constfold", "layout", "constfold"]) - rewriter_config.meta_optimizer_iterations = ( + rewriter_config_with_trt.optimizers.extend( + ["constfold", "layout", "constfold"]) + rewriter_config_with_trt.meta_optimizer_iterations = ( rewriter_config_pb2.RewriterConfig.ONE) + else: + rewriter_config_with_trt.CopyFrom(rewriter_config) if precision_mode.upper() not in TrtPrecisionMode.supported_precision_modes(): raise ValueError(("precision mode '{}' is not supported." @@ -121,7 +134,7 @@ def tensorrt_rewriter_config(rewriter_config=None, precision_mode, TrtPrecisionMode.supported_precision_modes)) - optimizer = rewriter_config.custom_optimizers.add() + optimizer = rewriter_config_with_trt.custom_optimizers.add() optimizer.name = "TensorRTOptimizer" optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size optimizer.parameter_map["max_batch_size"].i = max_batch_size @@ -138,7 +151,8 @@ def tensorrt_rewriter_config(rewriter_config=None, "maximum_cached_engines items.") optimizer.parameter_map["cached_engine_batches"].list.i.extend( cached_engine_batch_sizes) - return rewriter_config + optimizer.parameter_map["use_calibration"].b = use_calibration + return rewriter_config_with_trt def create_inference_graph(input_graph_def, @@ -150,7 +164,7 @@ def create_inference_graph(input_graph_def, is_dynamic_op=False, maximum_cached_engines=1, cached_engine_batch_sizes=None, - rewriter_config=None, + use_calibration=True, input_saved_model_dir=None, input_saved_model_tags=None, output_saved_model_dir=None, @@ -182,8 +196,15 @@ def create_inference_graph(input_graph_def, use this list to determine the batch sizes of the cached engines, instead of making the decision on the fly. This is useful when we know the most common batch size(s) the application is going to generate. - rewriter_config: a RewriterConfig proto to append the TensorRTOptimizer to. - If None, it will create one with default settings. + use_calibration: this argument is ignored if precision_mode is not INT8.
If + set to True, a calibration graph will be created to calibrate the missing + ranges. The calibration graph must be converted to an inference graph + using calib_graph_to_infer_graph() after running calibration. If set to + False, quantization nodes will be expected for every tensor in the graph + (excluding those which will be fused). If a range is missing, an error + will occur. Please note that accuracy may be negatively affected if there + is a mismatch between which tensors TRT quantizes and which tensors were + trained with fake quantization. input_saved_model_dir: the directory to load the SavedModel which contains the input graph to transforms. Used only when input_graph_def is None. input_saved_model_tags: list of tags to load the SavedModel. @@ -191,8 +212,9 @@ def create_inference_graph(input_graph_def, returned GraphDef and save it to the specified directory. This option only works when the input graph is loaded from a SavedModel, i.e. when input_saved_model_dir is specified and input_graph_def is None. - session_config: the ConfigProto used to create a Session. If not specified, - a default ConfigProto will be used. + session_config: the ConfigProto used to create a Session. It's also used as + a template to create a TRT-enabled ConfigProto for conversion. If not + specified, a default ConfigProto will be used. Returns: A GraphDef transformed from input_graph_def (or the SavedModel graph def @@ -322,21 +344,30 @@ def create_inference_graph(input_graph_def, grappler_meta_graph_def.collection_def["train_op"].CopyFrom( output_collection) - # Create RewriterConfig. - rewriter_config = tensorrt_rewriter_config( + # Create TRT-enabled ConfigProto. + session_config_with_trt = config_pb2.ConfigProto() + session_config_with_trt.CopyFrom(session_config) + rewriter_config = None + if (session_config_with_trt.HasField("graph_options") and + session_config_with_trt.graph_options.HasField("rewrite_options")): + rewriter_config = session_config_with_trt.graph_options.rewrite_options + rewriter_config_with_trt = get_tensorrt_rewriter_config( rewriter_config, max_batch_size, max_workspace_size_bytes, precision_mode, minimum_segment_size, is_dynamic_op, maximum_cached_engines, - cached_engine_batch_sizes) + cached_engine_batch_sizes, use_calibration) + session_config_with_trt.graph_options.rewrite_options.CopyFrom( + rewriter_config_with_trt) # Run Grappler. transformed_graph_def = tf_optimizer.OptimizeGraph( - rewriter_config, grappler_meta_graph_def, graph_id=b"tf_graph") + session_config_with_trt, grappler_meta_graph_def, graph_id=b"tf_graph") # Optionally write the transformed graphdef as SavedModel. if output_saved_model_dir is not None: saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir) with ops.Graph().as_default(): importer.import_graph_def(transformed_graph_def, name="") + # We don't use TRT here. 
with session.Session(config=session_config) as sess: saved_model_builder.add_meta_graph_and_variables( sess, diff --git a/tensorflow/contrib/tensorrt/python/trt_convert_test.py b/tensorflow/contrib/tensorrt/python/trt_convert_test.py index 9f2eeac990d..dbf8dd26144 100644 --- a/tensorflow/contrib/tensorrt/python/trt_convert_test.py +++ b/tensorflow/contrib/tensorrt/python/trt_convert_test.py @@ -47,9 +47,9 @@ from tensorflow.python.tools import saved_model_utils class TrtConvertTest(test_util.TensorFlowTestCase): """Class to test Tensorflow-TensorRT integration python API.""" - def testTensorrtRewriterConfig(self): - """Test case for trt_convert.tensorrt_rewriter_config().""" - rewriter_cfg = trt_convert.tensorrt_rewriter_config( + def testGetTensorrtRewriterConfig(self): + """Test case for trt_convert.get_tensorrt_rewriter_config().""" + rewriter_cfg = trt_convert.get_tensorrt_rewriter_config( rewriter_config=None, max_batch_size=128, max_workspace_size_bytes=1234, @@ -162,7 +162,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): node_name_to_op = {node.name: node.op for node in graph_def.node} self.assertEqual({ "input": "Placeholder", - "my_trt_op_0": "TRTEngineOp", + "TRTEngineOp_0": "TRTEngineOp", "output": "Identity" }, node_name_to_op) @@ -189,10 +189,11 @@ class TrtConvertTest(test_util.TensorFlowTestCase): execute_engine_test_value = ("done" if expect_engine_is_run else "") execute_native_segment_test_value = ("" if expect_engine_is_run else "done") self.assertEqual(execute_engine_test_value, - trt_convert.get_test_value("my_trt_op_0:ExecuteTrtEngine")) + trt_convert.get_test_value( + "TRTEngineOp_0:ExecuteTrtEngine")) self.assertEqual( execute_native_segment_test_value, - trt_convert.get_test_value("my_trt_op_0:ExecuteNativeSegment")) + trt_convert.get_test_value("TRTEngineOp_0:ExecuteNativeSegment")) def testCreateInferenceGraph_MinimumSegmentSize(self): if not trt_convert.is_tensorrt_enabled(): diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h index 840da6e78d8..aac9e5c7bd7 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_resources.h +++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h @@ -39,7 +39,8 @@ namespace tensorrt { class TRTCalibrationResource : public tensorflow::ResourceBase { public: ~TRTCalibrationResource() { - VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString(); + LOG(INFO) << "Destroying Calibration Resource " << std::endl + << DebugString(); builder_.reset(); engine_.reset(); // We need to manually destroy the builder and engine before the allocator diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc index 4f64b7a9522..d8f63779e64 100644 --- a/tensorflow/contrib/tensorrt/segment/segment.cc +++ b/tensorflow/contrib/tensorrt/segment/segment.cc @@ -33,6 +33,7 @@ namespace tensorflow { namespace tensorrt { namespace segment { using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; // A simple graph representation to mirror tensorflow::Graph. This structure // helps saving memory since segmenter modifies the graph in place, preventing @@ -406,22 +407,42 @@ tensorflow::Status SegmentGraph( // Use a union-find to collect the nodes that belong to the same // segment. A node value of nullptr indicates that the node is not a candidate // for TRT. 
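To make the union-find bookkeeping described in the comment above concrete, here is a minimal, self-contained Python sketch. It is illustrative only and not part of the patch: `is_candidate` stands in for the real `candidate_fn`, and the actual segmenter additionally contracts edges in reverse topological order and rejects merges that would create cycles.

```python
class UnionFind(object):
  """Union-find cell; a value of None marks a node that is not a TRT candidate."""

  def __init__(self, value):
    self.value = value
    self.parent = self

  def root(self):
    if self.parent is not self:
      self.parent = self.parent.root()  # Path compression.
    return self.parent

  def merge(self, other):
    self.root().parent = other.root()


def segment_graph(nodes, edges, is_candidate):
  """Groups candidate nodes into segments by merging along edges."""
  cells = {n: UnionFind(n if is_candidate(n) else None) for n in nodes}
  for src, dst in edges:
    # Only merge edges whose endpoints are both TRT candidates.
    if cells[src].value is not None and cells[dst].value is not None:
      cells[src].merge(cells[dst])
  segments = {}
  for n in nodes:
    if cells[n].value is not None:
      segments.setdefault(id(cells[n].root()), []).append(n)
  return list(segments.values())


# 'relu' is treated as unsupported here, so it splits the graph in two:
# [['conv', 'bias_add'], ['matmul']].
print(segment_graph(
    nodes=["conv", "bias_add", "relu", "matmul"],
    edges=[("conv", "bias_add"), ("bias_add", "relu"), ("relu", "matmul")],
    is_candidate=lambda n: n != "relu"))
```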
+ std::unordered_set unsupported_ops; + int num_unsupported_ops = 0; std::vector> node_segments; for (int i = 0; i < graph->num_node_ids(); ++i) { SimpleNode* node = graph->FindNodeId(i); if (options.exclude_node_list.count(node->name()) != 0) { - VLOG(1) << "Not a TF-TRT candidate: " << node->name() - << " (excluded by segmenter option)."; + VLOG(1) << "Not a TF-TRT candidate, " + << "(Op type: " << node->tf_node()->type_string() << "), " + << "(Op name: " << node->name() << "), " + << "(Reason: excluded by segmenter option)"; + unsupported_ops.emplace(node->tf_node()->type_string()); + num_unsupported_ops++; node = nullptr; } else { const Status status = candidate_fn(node->tf_node()); if (!status.ok()) { - VLOG(1) << "Not a TF-TRT candidate: " << node->name() << ": " << status; + VLOG(1) << "Not a TF-TRT candidate, " + << "(Op type: " << node->tf_node()->type_string() << "), " + << "(Op name: " << node->name() << "), " + << "(Reason: " << status << ")"; + unsupported_ops.emplace(node->tf_node()->type_string()); + num_unsupported_ops++; node = nullptr; } } node_segments.emplace_back(node); } + string msg = StrCat( + "There are ", num_unsupported_ops, " ops of ", unsupported_ops.size(), + " different types in the graph that", " are not converted to TensorRT: "); + for (const auto& elem : unsupported_ops) { + StrAppend(&msg, elem, ", "); + } + LOG(INFO) << msg << "(For more information see " + << "https://docs.nvidia.com/deeplearning" + << "/dgx/integrate-tf-trt/index.html#support-ops)."; // The segmentation algorithm below visits nodes in reverse topological order // and attempts to merge nodes along output edges. That means that subgraphs @@ -439,7 +460,8 @@ tensorflow::Status SegmentGraph( std::vector order; order.reserve(graph->num_node_ids()); StableDFS(*graph, /*reverse=*/false, {graph->source_node()}, - /*enter=*/nullptr, [&order](const SimpleNode* n) { + /*enter=*/nullptr, + [&order](const SimpleNode* n) { order.push_back(n); return true; }); @@ -548,7 +570,7 @@ tensorflow::Status SegmentGraph( std::set& segment_nodes = itr.second; VLOG(1) << "Segment original size: " << segment_nodes.size(); while (true) { - std::deque in_nodes_que, out_nodes_que; + std::deque in_nodes_que, out_nodes_que; // Find an input node that is not eligible and add it to the queue. // Nodes that has no incoming edges should not be treated as "input", // as there are really no inputs to them. Similar for output nodes. @@ -594,8 +616,7 @@ tensorflow::Status SegmentGraph( // their outputs. In this way, for common cases the number of removed // nodes should be minimum. auto remove_nodes = [&segment_nodes]( - bool is_input_nodes, - std::deque* que) { + bool is_input_nodes, std::deque* que) { // Run a BFS on the queue to find all the input/output nodes. 
std::set visited; std::set logged(que->begin(), que->end()); diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/contrib/tensorrt/test/base_test.py index 18096e0ff1e..03faf1df243 100644 --- a/tensorflow/contrib/tensorrt/test/base_test.py +++ b/tensorflow/contrib/tensorrt/test/base_test.py @@ -56,8 +56,9 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase): strides=[1, 2, 2, 1], padding="SAME", name="conv") - bias = constant_op.constant( - [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtype) + bias = constant_op.constant([4., 1.5, 2., 3., 5., 7.], + name="bias", + dtype=dtype) added = nn.bias_add(conv, bias, name="bias_add") relu = nn.relu(added, "relu") identity = array_ops.identity(relu, "identity") @@ -73,11 +74,12 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which - # breaks the connection check, fix it. - # - my_trt_op_0 should have ["weights", "conv", "bias", "bias_add", - # "relu", "identity", "max_pool"] - return ["my_trt_op_0"] + return { + "my_trt_op_0": [ + "weights", "conv", "bias", "bias_add", "relu", "identity", + "max_pool" + ] + } class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase): @@ -92,7 +94,7 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase): g = ops.Graph() with g.as_default(): inp = array_ops.placeholder( - dtype=dtype, shape=[None] + input_dims[1:], name=input_name) + dtype=dtype, shape=input_dims, name=input_name) with g.device("/GPU:0"): conv_filter = constant_op.constant( [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]], @@ -105,10 +107,10 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase): padding="SAME", name="conv") c1 = constant_op.constant( - np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c1") + np.random.randn(12, 12, 6), dtype=dtype, name="c1") p = math_ops.mul(conv, c1, name="mul") c2 = constant_op.constant( - np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c2") + np.random.randn(12, 12, 6), dtype=dtype, name="c2") q = math_ops.div(conv, c2, name="div") edge = self.trt_incompatible_op(q, name="incompatible") @@ -129,22 +131,21 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which - # breaks the connection check, fix it. - # - my_trt_op_0 should have ["mul", "sub", "div1", "mul1", "add1", - # "add", "sub1"]; - # - my_trt_op_1 should have ["weights","conv", "div"] - return ["my_trt_op_0", "my_trt_op_1"] + return { + "my_trt_op_0": [ + "add", "add1", "c1", "div1", "mul", "mul1", "sub", "sub1" + ], + "my_trt_op_1": ["c2", "conv", "div", "weights"] + } - def ShouldRunTest(self, run_params): - # TODO(aaroey): LayoutOptimizer adds Transpose(Const, Const) to the graph - # which breaks the conversion. We should fix it as: - # - Detect the invalid NodeDef earlier before adding them to segment - # - Let it able to change the RewriterConfig when calling - # create_inference_graph(). - # It will be good to add debugging feature for Grappler to print the graph - # after running each optimizer. 
- return False + def GetConversionParams(self, run_params): + """Return a ConversionParams for test.""" + return super( + SimpleMultiEnginesTest, self + ).GetConversionParams(run_params)._replace( + # Disable layout optimizer, since it'll add Transpose(Const, Const) to + # the graph and breaks the conversion check. + rewriter_config=trt_test.OptimizerDisabledRewriterConfig()) class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase): @@ -153,7 +154,7 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase): """Setup method.""" super(PartiallyConvertedTestA, self).setUp() # Let it fail to build the second engine. - trt_convert.add_test_value("my_trt_op_1:CreateTRTNode", "fail") + trt_convert.add_test_value("TRTEngineOp_1:CreateTRTNode", "fail") def GetParams(self): """Create a graph containing two segment.""" @@ -190,14 +191,16 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase): """Return the expected engines to build.""" return { # Only the first engine is built. - "my_trt_op_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"] + "TRTEngineOp_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"] } def ShouldRunTest(self, run_params): """Whether to run the test.""" # Disable the test in fp16 mode since multiple matmul and add ops together # can cause overflow. - return run_params.precision_mode != "FP16" + return ((run_params.precision_mode != "FP16") and + not (trt_test.IsQuantizationMode(run_params.precision_mode) and + not run_params.use_calibration)) class PartiallyConvertedTestB(PartiallyConvertedTestA): @@ -207,13 +210,13 @@ class PartiallyConvertedTestB(PartiallyConvertedTestA): super(PartiallyConvertedTestB, self).setUp() # Let it fail to build the first engine. trt_convert.clear_test_values("") - trt_convert.add_test_value("my_trt_op_0:CreateTRTNode", "fail") + trt_convert.add_test_value("TRTEngineOp_0:CreateTRTNode", "fail") def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" return { # Only the second engine is built. - "my_trt_op_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"] + "TRTEngineOp_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"] } @@ -257,8 +260,8 @@ class ConstInputTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" return { - "my_trt_op_0": ["add", "add1", "mul"], - "my_trt_op_1": ["add2", "add3", "mul1"] + "TRTEngineOp_0": ["add", "add1", "mul"], + "TRTEngineOp_1": ["add2", "add3", "mul1"] } @@ -289,7 +292,7 @@ class ConstDataInputSingleEngineTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - return {"my_trt_op_0": ["c", "add", "add1", "mul"]} + return {"TRTEngineOp_0": ["c", "add", "add1", "mul"]} class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase): @@ -324,12 +327,12 @@ class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" return { - "my_trt_op_0": ["add2", "add3", "mul1"], + "TRTEngineOp_0": ["add2", "add3", "mul1"], # Why segment ["add", "add1", "mul"] was assigned segment id 1 # instead of 0: the parent node of this segment is actually const # node 'c', but it's removed later since it's const output of the # segment which is not allowed. 
- "my_trt_op_1": ["add", "add1", "mul"] + "TRTEngineOp_1": ["add", "add1", "mul"] } @@ -373,8 +376,8 @@ class ControlDependencyTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" return { - "my_trt_op_0": ["c1", "add", "add1", "mul"], - "my_trt_op_1": ["c2", "add2", "add3", "mul1"] + "TRTEngineOp_0": ["c1", "add", "add1", "mul"], + "TRTEngineOp_1": ["c2", "add2", "add3", "mul1"] } diff --git a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py index 4b888081787..f42308ecb7c 100644 --- a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py +++ b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py @@ -79,12 +79,12 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase): """Return the expected engines to build.""" if (run_params.dynamic_engine and not trt_test.IsQuantizationMode(run_params.precision_mode)): - return ["my_trt_op_0", "my_trt_op_1"] - return ["my_trt_op_1"] + return ["TRTEngineOp_0", "TRTEngineOp_1"] + return ["TRTEngineOp_1"] def ExpectedEnginesToRun(self, run_params): """Return the expected engines to run.""" - return ["my_trt_op_1"] + return ["TRTEngineOp_1"] def ShouldRunTest(self, run_params): """Whether to run the test.""" diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py index 7545bb9df20..053b38ff1c0 100644 --- a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py +++ b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py @@ -41,6 +41,7 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase): input_name = "input" input_matrix_rows = 4 input_matrix_columns = 144 + # Note that tf.nn.bias_add supports up to 5 dimensions. 
input_dims = [input_matrix_rows, input_matrix_columns] output_name = "output" g = ops.Graph() @@ -74,18 +75,18 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase): x5 = nn.bias_add(x5, b) x5 = gen_array_ops.reshape(x5, [4, -1]) - x6 = gen_array_ops.reshape(x, [4, 12, 12]) - b = self._ConstOp((12,)) + x6 = gen_array_ops.reshape(x, [4, 24, 6]) + b = self._ConstOp((6,)) x6 = nn.bias_add(x6, b, data_format="NHWC") x6 = gen_array_ops.reshape(x6, [4, -1]) - x7 = gen_array_ops.reshape(x, [4, 12, 3, 4]) - b = self._ConstOp((4,)) + x7 = gen_array_ops.reshape(x, [4, 12, 4, 3]) + b = self._ConstOp((3,)) x7 = nn.bias_add(x7, b, data_format="NHWC") x7 = gen_array_ops.reshape(x7, [4, -1]) - x8 = gen_array_ops.reshape(x, [4, 12, 3, 2, 2]) - b = self._ConstOp((2,)) + x8 = gen_array_ops.reshape(x, [4, 4, 3, 2, 6]) + b = self._ConstOp((6,)) x8 = nn.bias_add(x8, b, data_format="NHWC") x8 = gen_array_ops.reshape(x8, [4, -1]) @@ -94,13 +95,13 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase): x9 = nn.bias_add(x9, b, data_format="NCHW") x9 = gen_array_ops.reshape(x9, [4, -1]) - x10 = gen_array_ops.reshape(x, [4, 12, 3, 4]) - b = self._ConstOp((12,)) + x10 = gen_array_ops.reshape(x, [4, 3, 4, 12]) + b = self._ConstOp((3,)) x10 = nn.bias_add(x10, b, data_format="NCHW") x10 = gen_array_ops.reshape(x10, [4, -1]) - x11 = gen_array_ops.reshape(x, [4, 12, 12]) - b = self._ConstOp((12,)) + x11 = gen_array_ops.reshape(x, [4, 6, 24]) + b = self._ConstOp((6,)) x11 = nn.bias_add(x11, b, data_format="NCHW") x11 = gen_array_ops.reshape(x11, [4, -1]) @@ -116,13 +117,18 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase): def GetConversionParams(self, run_params): """Return a ConversionParams for test.""" - return super(BiasaddMatMulTest, - self).GetConversionParams(run_params)._replace( - max_batch_size=4, maximum_cached_engines=1) + conversion_params = super(BiasaddMatMulTest, + self).GetConversionParams(run_params) + return conversion_params._replace( + max_batch_size=4, + maximum_cached_engines=1, + # Disable layout optimizer, since it will convert BiasAdd with NHWC + # format to NCHW format under four dimensional input. 
+ rewriter_config=trt_test.OptimizerDisabledRewriterConfig()) def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - return ["my_trt_op_0"] + return ["TRTEngineOp_0"] def ShouldRunTest(self, run_params): """Whether to run the test.""" diff --git a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py index b53cb3c091e..169835956c0 100644 --- a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py +++ b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py @@ -26,7 +26,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops -from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -56,10 +55,10 @@ class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase): ]: a = self._ConstOp(weights_shape) f = x + a - x = math_ops.sigmoid(f) + x = self.trt_incompatible_op(f) a = self._ConstOp(weights_shape) f = a + x - x = math_ops.sigmoid(f) + x = self.trt_incompatible_op(f) gen_array_ops.reshape(x, [5, -1], name=output_name) return trt_test.TfTrtIntegrationTestParams( gdef=g.as_graph_def(), @@ -70,7 +69,7 @@ class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - return ["my_trt_op_%d" % i for i in range(16)] + return ["TRTEngineOp_%d" % i for i in range(16)] if __name__ == "__main__": diff --git a/tensorflow/contrib/tensorrt/test/concatenation_test.py b/tensorflow/contrib/tensorrt/test/concatenation_test.py index 465cb022964..c3576f81d97 100644 --- a/tensorflow/contrib/tensorrt/test/concatenation_test.py +++ b/tensorflow/contrib/tensorrt/test/concatenation_test.py @@ -79,7 +79,7 @@ class ConcatenationTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - return ["my_trt_op_0"] + return ["TRTEngineOp_0"] if __name__ == "__main__": diff --git a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py index e32f0478661..c1c883312d8 100644 --- a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py +++ b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py @@ -64,7 +64,7 @@ class ConstBroadcastTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - return ['my_trt_op_0'] + return ['TRTEngineOp_0'] def ExpectedAbsoluteTolerance(self, run_params): """The absolute tolerance to compare floating point results.""" diff --git a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py index bc7c90081ff..104bac43a0b 100644 --- a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py +++ b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py @@ -68,7 +68,7 @@ class MemoryAlignmentTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - return ["my_trt_op_0"] + return ["TRTEngineOp_0"] def ExpectedAbsoluteTolerance(self, run_params): """The absolute tolerance to compare floating point results.""" diff --git a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py 
b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py index 11be4feaf7b..293f93d8a78 100644 --- a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py +++ b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py @@ -25,8 +25,6 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gen_math_ops -from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.platform import test @@ -60,14 +58,14 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase): b = constant_op.constant( np.random.normal(5.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype) q = conv - b - edge = math_ops.sigmoid(q) + edge = self.trt_incompatible_op(q) b = constant_op.constant( np.random.normal(5.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype) d = b + conv - edge3 = math_ops.sigmoid(d) + edge3 = self.trt_incompatible_op(d) - edge1 = gen_math_ops.tan(conv) + edge1 = self.trt_incompatible_op(conv) t = t - edge1 q = q + edge t = t + q @@ -83,7 +81,7 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - return ["my_trt_op_0", "my_trt_op_1"] + return ["TRTEngineOp_0", "TRTEngineOp_1"] if __name__ == "__main__": diff --git a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py index eddeafa38bc..3e1e4b088ba 100644 --- a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py +++ b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py @@ -66,8 +66,8 @@ class NeighboringEngineTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" return { - "my_trt_op_0": ["bias", "mul", "sub"], - "my_trt_op_1": ["weights", "conv"] + "TRTEngineOp_0": ["bias", "mul", "sub"], + "TRTEngineOp_1": ["weights", "conv"] } diff --git a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py new file mode 100644 index 00000000000..31cbef89e23 --- /dev/null +++ b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py @@ -0,0 +1,290 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Script to test TF-TRT INT8 conversion without calibration on Mnist model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.tensorrt.python import trt_convert +# pylint: disable=unused-import +from tensorflow.contrib.tensorrt.python.ops import trt_engine_op +# pylint: enable=unused-import +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python import data +from tensorflow.python import keras +from tensorflow.python.estimator.estimator import Estimator +from tensorflow.python.estimator.model_fn import EstimatorSpec +from tensorflow.python.estimator.model_fn import ModeKeys +from tensorflow.python.estimator.run_config import RunConfig +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import graph_util +from tensorflow.python.framework import importer +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.keras.datasets import mnist +from tensorflow.python.layers import layers +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics +from tensorflow.python.ops import nn +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops.losses import losses +from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.summary import summary +from tensorflow.python.training import saver +from tensorflow.python.training.adam import AdamOptimizer +from tensorflow.python.training.checkpoint_management import latest_checkpoint +from tensorflow.python.training.training_util import get_global_step + +INPUT_NODE_NAME = 'input' +OUTPUT_NODE_NAME = 'output' + + +class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase): + + def _BuildGraph(self, x): + + def _Quantize(x, r): + x = gen_array_ops.quantize_and_dequantize_v2(x, -r, r) + return x + + def _DenseLayer(x, num_inputs, num_outputs, quantization_range, name): + """Dense layer with quantized outputs. + + Args: + x: input to the dense layer + num_inputs: number of input columns of x + num_outputs: number of output columns + quantization_range: the min/max range for quantization + name: name of the variable scope + + Returns: + The output of the layer. 
+ """ + with variable_scope.variable_scope(name): + kernel = variable_scope.get_variable( + 'kernel', + shape=[num_inputs, num_outputs], + dtype=dtypes.float32, + initializer=keras.initializers.glorot_uniform()) + bias = variable_scope.get_variable( + 'bias', + shape=[num_outputs], + dtype=dtypes.float32, + initializer=keras.initializers.zeros()) + x = math_ops.matmul(x, kernel) + x = _Quantize(x, quantization_range) + x = nn.bias_add(x, bias) + x = _Quantize(x, quantization_range) + return x + + x = _Quantize(x, 1) + # Conv + Bias + Relu6 + x = layers.conv2d(x, filters=32, kernel_size=3, use_bias=True) + x = nn.relu6(x) + # Conv + Bias + Relu6 + x = layers.conv2d(x, filters=64, kernel_size=3, use_bias=True) + x = nn.relu6(x) + # Reduce + x = math_ops.reduce_mean(x, [1, 2]) + x = _Quantize(x, 6) + # FC1 + x = _DenseLayer(x, 64, 512, 6, name='dense') + x = nn.relu6(x) + # FC2 + x = _DenseLayer(x, 512, 10, 25, name='dense_1') + x = array_ops.identity(x, name=OUTPUT_NODE_NAME) + return x + + def _GetGraphDef(self, use_trt, max_batch_size, model_dir): + """Get the frozen mnist GraphDef. + + Args: + use_trt: whether use TF-TRT to convert the graph. + max_batch_size: the max batch size to apply during TF-TRT conversion. + model_dir: the model directory to load the checkpoints. + + Returns: + The frozen mnist GraphDef. + """ + graph = ops.Graph() + with self.session(graph=graph) as sess: + with graph.device('/GPU:0'): + x = array_ops.placeholder( + shape=(None, 28, 28, 1), dtype=dtypes.float32, name=INPUT_NODE_NAME) + self._BuildGraph(x) + # Load weights + mnist_saver = saver.Saver() + checkpoint_file = latest_checkpoint(model_dir) + mnist_saver.restore(sess, checkpoint_file) + # Freeze + graph_def = graph_util.convert_variables_to_constants( + sess, sess.graph_def, output_node_names=[OUTPUT_NODE_NAME]) + # Convert with TF-TRT + if use_trt: + logging.info('Number of nodes before TF-TRT conversion: %d', + len(graph_def.node)) + graph_def = trt_convert.create_inference_graph( + graph_def, + outputs=[OUTPUT_NODE_NAME], + max_batch_size=max_batch_size, + precision_mode='INT8', + max_workspace_size_bytes=4096 << 19, + minimum_segment_size=2, + use_calibration=False, + ) + logging.info('Number of nodes after TF-TRT conversion: %d', + len(graph_def.node)) + num_engines = len( + [1 for n in graph_def.node if str(n.op) == 'TRTEngineOp']) + self.assertEqual(1, num_engines) + return graph_def + + def _Run(self, is_training, use_trt, batch_size, num_epochs, model_dir): + """Train or evaluate the model. + + Args: + is_training: whether to train or evaluate the model. In training mode, + quantization will be simulated where the quantize_and_dequantize_v2 are + placed. + use_trt: if true, use TRT INT8 mode for evaluation, which will perform + real quantization. Otherwise use native TensorFlow which will perform + simulated quantization. Ignored if is_training is True. + batch_size: batch size. + num_epochs: how many epochs to train. Ignored if is_training is False. + model_dir: where to save or load checkpoint. + + Returns: + The Estimator evaluation result. 
+ """ + # Get dataset + train_data, test_data = mnist.load_data() + + def _PreprocessFn(x, y): + x = math_ops.cast(x, dtypes.float32) + x = array_ops.expand_dims(x, axis=2) + x = 2.0 * (x / 255.0) - 1.0 + y = math_ops.cast(y, dtypes.int32) + return x, y + + def _EvalInputFn(): + mnist_x, mnist_y = test_data + dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y)) + dataset = dataset.apply( + data.experimental.map_and_batch( + map_func=_PreprocessFn, + batch_size=batch_size, + num_parallel_calls=8)) + dataset = dataset.repeat(count=1) + iterator = data.make_one_shot_iterator(dataset) + features, labels = iterator.get_next() + return features, labels + + def _TrainInputFn(): + mnist_x, mnist_y = train_data + dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y)) + dataset = dataset.shuffle(2 * len(mnist_x)) + dataset = dataset.apply( + data.experimental.map_and_batch( + map_func=_PreprocessFn, + batch_size=batch_size, + num_parallel_calls=8)) + dataset = dataset.repeat(count=num_epochs) + iterator = data.make_one_shot_iterator(dataset) + features, labels = iterator.get_next() + return features, labels + + def _ModelFn(features, labels, mode): + if is_training: + logits_out = self._BuildGraph(features) + else: + graph_def = self._GetGraphDef(use_trt, batch_size, model_dir) + logits_out = importer.import_graph_def( + graph_def, + input_map={INPUT_NODE_NAME: features}, + return_elements=[OUTPUT_NODE_NAME + ':0'], + name='')[0] + + loss = losses.sparse_softmax_cross_entropy( + labels=labels, logits=logits_out) + summary.scalar('loss', loss) + + classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out') + accuracy = metrics.accuracy( + labels=labels, predictions=classes_out, name='acc_op') + summary.scalar('accuracy', accuracy[1]) + + if mode == ModeKeys.EVAL: + return EstimatorSpec( + mode, loss=loss, eval_metric_ops={'accuracy': accuracy}) + elif mode == ModeKeys.TRAIN: + optimizer = AdamOptimizer(learning_rate=1e-2) + train_op = optimizer.minimize(loss, global_step=get_global_step()) + return EstimatorSpec(mode, loss=loss, train_op=train_op) + + config_proto = config_pb2.ConfigProto() + config_proto.gpu_options.allow_growth = True + estimator = Estimator( + model_fn=_ModelFn, + model_dir=model_dir if is_training else None, + config=RunConfig(session_config=config_proto)) + + if is_training: + estimator.train(_TrainInputFn) + results = estimator.evaluate(_EvalInputFn) + logging.info('accuracy: %s', str(results['accuracy'])) + return results + + # To generate the checkpoint, set a different model_dir and call self._Run() + # by setting is_training=True and num_epochs=1000, e.g.: + # model_dir = '/tmp/quantization_mnist' + # self._Run( + # is_training=True, + # use_trt=False, + # batch_size=128, + # num_epochs=100, + # model_dir=model_dir) + def testEval(self): + if not trt_convert.is_tensorrt_enabled(): + return + model_dir = test.test_src_dir_path('contrib/tensorrt/test/testdata') + + accuracy_tf_native = self._Run( + is_training=False, + use_trt=False, + batch_size=128, + num_epochs=None, + model_dir=model_dir)['accuracy'] + logging.info('accuracy_tf_native: %f', accuracy_tf_native) + self.assertAllClose(accuracy_tf_native, 0.9662) + + if trt_convert.get_linked_tensorrt_version()[0] < 5: + return + + accuracy_tf_trt = self._Run( + is_training=False, + use_trt=True, + batch_size=128, + num_epochs=None, + model_dir=model_dir)['accuracy'] + logging.info('accuracy_tf_trt: %f', accuracy_tf_trt) + self.assertAllClose(accuracy_tf_trt, 0.9677) + + +if __name__ == '__main__': + 
test.main() diff --git a/tensorflow/contrib/tensorrt/test/quantization_test.py b/tensorflow/contrib/tensorrt/test/quantization_test.py new file mode 100644 index 00000000000..28353273ede --- /dev/null +++ b/tensorflow/contrib/tensorrt/test/quantization_test.py @@ -0,0 +1,144 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Model script to test TF-TensorRT integration.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.tensorrt.python import trt_convert +from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +def _GetParams(add_quantization_nodes, dtype=dtypes.float32): + input_name = "input" + input_dims = [8, 8] + output_name = "output" + + def _Quantize(x, r): + if add_quantization_nodes: + x = gen_array_ops.fake_quant_with_min_max_vars(x, -r, r) + return x + + g = ops.Graph() + with g.as_default(): + x = array_ops.placeholder( + dtype=dtype, shape=[None] + input_dims[1:], name=input_name) + x = _Quantize(x, 10.0) + x = x + 5 + x = _Quantize(x, 15.0) + x = x - 5 + x = _Quantize(x, 10.0) + x = x * 0.1 + x = _Quantize(x, 1.0) + w = constant_op.constant(np.ones((8, 1)), dtype=dtypes.float32) + x = math_ops.matmul(x, w) + x = _Quantize(x, 10.0) + x = array_ops.identity(x, name=output_name) + + return trt_test.TfTrtIntegrationTestParams( + gdef=g.as_graph_def(), + input_names=[input_name], + input_dims=[input_dims], + output_names=[output_name], + expected_output_dims=[(8, 1)]) + + +class QuantizationMissingAllRangesTest(trt_test.TfTrtIntegrationTestBase): + + def GetParams(self): + """Create a graph containing single segment with no quantization ranges.""" + return _GetParams(add_quantization_nodes=False) + + def ShouldRunTest(self, run_params): + if trt_convert.get_linked_tensorrt_version()[0] < 5: + return False + # Only test static engine mode, with or without calibration. + return (trt_test.IsQuantizationMode(run_params.precision_mode) and + not run_params.use_optimizer and not run_params.dynamic_engine) + + def ExpectedEnginesToBuild(self, run_params): + """Return the expected engines to build.""" + if run_params.use_calibration: + # In static engine mode with calibration, it should build a calibration + # engine. + return ["my_trt_op_0"] + # In static engine mode without calibration, the engine building will fail + # since no quantization ranges are set, which results in no TRT nodes. 
+ return [] + + +class QuantizationWithRangesTest(trt_test.TfTrtIntegrationTestBase): + + def GetParams(self): + """Create a graph containing single segment with no quantization ranges.""" + return _GetParams(add_quantization_nodes=True) + + def ShouldRunTest(self, run_params): + if trt_convert.get_linked_tensorrt_version()[0] < 5: + return False + # Test static/dynamic engine with/without calibration. + return (trt_test.IsQuantizationMode(run_params.precision_mode) and + not run_params.use_optimizer) + + def ExpectedEnginesToBuild(self, run_params): + """Return the expected engines to build.""" + return ["my_trt_op_0"] + + def ExpectedAbsoluteTolerance(self, run_params): + """The absolute tolerance to compare floating point results.""" + return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01 + + def ExpectedRelativeTolerance(self, run_params): + """The relative tolerance to compare floating point results.""" + return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01 + + +class NonQuantizedPrecisionsWithRangesTest(trt_test.TfTrtIntegrationTestBase): + + def GetParams(self): + """Create a graph containing single segment with no quantization ranges.""" + return _GetParams(add_quantization_nodes=True) + + def ShouldRunTest(self, run_params): + # Only test FP32/FP16 mode. + return not trt_test.IsQuantizationMode(run_params.precision_mode) + + def ExpectedEnginesToBuild(self, run_params): + """Return the expected engines to build.""" + # The fake quant ops are not supported in FP32/FP16 mode, and will split the + # graph into three TRT segments. + return ["my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3"] + + def ExpectedAbsoluteTolerance(self, run_params): + """The absolute tolerance to compare floating point results.""" + return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01 + + def ExpectedRelativeTolerance(self, run_params): + """The relative tolerance to compare floating point results.""" + return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01 + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/tensorrt/test/rank_two_test.py b/tensorflow/contrib/tensorrt/test/rank_two_test.py index 74a4a059257..0cd733dca13 100644 --- a/tensorflow/contrib/tensorrt/test/rank_two_test.py +++ b/tensorflow/contrib/tensorrt/test/rank_two_test.py @@ -68,11 +68,11 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" return { - "my_trt_op_0": [ + "TRTEngineOp_0": [ "add0_1", "add0_2", "add0_3", "c0_1", "c0_2", "c0_3", "abs0_1", "abs0_2" ], - "my_trt_op_1": [ + "TRTEngineOp_1": [ "add", "add1_1", "add1_2", "add1_3", "c1_1", "c1_2", "c1_3", "abs1_1", "abs1_2", "reciprocal0", "reciprocal1" ], diff --git a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py b/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py index bbc724ab18e..207944468ab 100644 --- a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py +++ b/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py @@ -79,8 +79,8 @@ class ReshapeTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" return { - "my_trt_op_0": ["reshape-%d" % i for i in range(7)] + - ["reshape-%d/shape" % i for i in range(7)] + "TRTEngineOp_0": ["reshape-%d" % i for i in range(7)] + + ["reshape-%d/shape" % i for i in range(7)] } def ShouldRunTest(self, run_params): @@ -117,7 +117,7 @@ class 
TransposeTest(trt_test.TfTrtIntegrationTestBase): # Note: by default Grappler will run the TRT optimizer twice. At the # first time it will group the two transpose ops below to same segment # then fail the conversion due to the expected batch dimension problem. - # At the second time, since the input of bridge op is my_trt_op_0, it + # At the second time, since the input of bridge op is TRTEngineOp_0, it # will fail to do shape inference which then cause conversion to fail. # TODO(laigd): support shape inference, make TRT optimizer run only # once, and fix this. @@ -136,7 +136,7 @@ class TransposeTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" return { - "my_trt_op_0": [ + "TRTEngineOp_0": [ "transpose-1", "transpose-1/perm", "transposeback", "transposeback/perm" ] diff --git a/tensorflow/contrib/tensorrt/test/testdata/checkpoint b/tensorflow/contrib/tensorrt/test/testdata/checkpoint new file mode 100644 index 00000000000..a603e1aec91 --- /dev/null +++ b/tensorflow/contrib/tensorrt/test/testdata/checkpoint @@ -0,0 +1,3 @@ +model_checkpoint_path: "model.ckpt-46900" +all_model_checkpoint_paths: "model.ckpt-0" +all_model_checkpoint_paths: "model.ckpt-46900" diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 new file mode 100644 index 00000000000..88a998f184b Binary files /dev/null and b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 differ diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index new file mode 100644 index 00000000000..53797657133 Binary files /dev/null and b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index differ diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py index a725d0651c9..495a9391a1e 100644 --- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py @@ -30,6 +30,7 @@ from tensorflow.contrib.tensorrt.python import trt_convert from tensorflow.contrib.tensorrt.python.ops import trt_engine_op # pylint: enable=unused-import from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.framework import dtypes from tensorflow.python.framework import graph_io from tensorflow.python.framework import importer @@ -42,14 +43,15 @@ TfTrtIntegrationTestParams = namedtuple("TfTrtIntegrationTestParams", [ "gdef", "input_names", "input_dims", "output_names", "expected_output_dims" ]) -RunParams = namedtuple( - "RunParams", - ["use_optimizer", "precision_mode", "dynamic_engine", "test_name"]) +RunParams = namedtuple("RunParams", [ + "use_optimizer", "precision_mode", "dynamic_engine", "test_name", + "use_calibration" +]) ConversionParams = namedtuple("ConversionParams", [ "max_batch_size", "max_workspace_size_bytes", "precision_mode", "minimum_segment_size", "is_dynamic_op", "maximum_cached_engines", - "cached_engine_batch_sizes", "rewriter_config" + "cached_engine_batch_sizes", "rewriter_config", "use_calibration" ]) PRECISION_MODES = ["FP32", "FP16", "INT8"] @@ -65,6 +67,34 @@ class GraphState(object): INFERENCE = 2 +def OptimizerDisabledRewriterConfig(): + """Returns a RewriterConfig with all default 
Grappler optimizers disabled.""" + rewriter_config = rewriter_config_pb2.RewriterConfig() + + # Turn off all default Grappler optimizers. + off = rewriter_config_pb2.RewriterConfig.OFF + rewriter_config.layout_optimizer = off + rewriter_config.constant_folding = off + rewriter_config.shape_optimization = off + rewriter_config.remapping = off + rewriter_config.arithmetic_optimization = off + rewriter_config.dependency_optimization = off + rewriter_config.loop_optimization = off + rewriter_config.function_optimization = off + rewriter_config.debug_stripper = off + rewriter_config.disable_model_pruning = True + rewriter_config.scoped_allocator_optimization = off + rewriter_config.memory_optimization = ( + rewriter_config_pb2.RewriterConfig.NO_MEM_OPT) + rewriter_config.pin_to_host_optimization = off + rewriter_config.auto_parallel.enable = False + + # Run only once for each enabled optimizer. + rewriter_config.meta_optimizer_iterations = ( + rewriter_config_pb2.RewriterConfig.ONE) + return rewriter_config + + class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): """Class to test Tensorflow-TensorRT integration.""" @@ -139,11 +169,15 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): is_dynamic_op=run_params.dynamic_engine, maximum_cached_engines=1, cached_engine_batch_sizes=None, - rewriter_config=None) + rewriter_config=None, + use_calibration=run_params.use_calibration) def ShouldRunTest(self, run_params): """Whether to run the test.""" - return True + # This setting combination requires quantization nodes to be present in + # order to build the engine. + return not (IsQuantizationMode(run_params.precision_mode) and + not run_params.use_calibration) def VerifyRunForEngine(self, engine_name, graph_state, expect_run=True): """Verify the state of a particular engine after sess.run().""" @@ -194,34 +228,35 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): def _PrepareRun(self, graph_state): """Set up necessary testing environment before calling sess.run().""" # Clear test values added by TRTEngineOp. 
- trt_convert.clear_test_values("my_trt_op_.*:ExecuteTrtEngine") - trt_convert.clear_test_values("my_trt_op_.*:ExecuteCalibration") - trt_convert.clear_test_values("my_trt_op_.*:ExecuteNativeSegment") + trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteTrtEngine") + trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteCalibration") + trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteNativeSegment") + + def _GetGPUOptions(self): + gpu_options = config_pb2.GPUOptions() + gpu_options.allow_growth = True + return gpu_options def _GetConfigProto(self, run_params, graph_state): """Get config proto based on specific settings.""" if graph_state != GraphState.ORIGINAL and run_params.use_optimizer: conversion_params = self.GetConversionParams(run_params) - rewriter_cfg = trt_convert.tensorrt_rewriter_config( + rewriter_cfg = trt_convert.get_tensorrt_rewriter_config( conversion_params.rewriter_config, conversion_params.max_batch_size, conversion_params.max_workspace_size_bytes, conversion_params.precision_mode, conversion_params.minimum_segment_size, conversion_params.is_dynamic_op, conversion_params.maximum_cached_engines, - conversion_params.cached_engine_batch_sizes) + conversion_params.cached_engine_batch_sizes, + conversion_params.use_calibration) graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg) else: graph_options = config_pb2.GraphOptions() - gpu_options = config_pb2.GPUOptions() - gpu_options.allow_growth = True - if trt_convert.get_linked_tensorrt_version()[0] == 3: - gpu_options.per_process_gpu_memory_fraction = 0.50 - config = config_pb2.ConfigProto( - gpu_options=gpu_options, graph_options=graph_options) + gpu_options=self._GetGPUOptions(), graph_options=graph_options) return config def _ExpectTestValue(self, engine_name, method, expected_value): @@ -291,6 +326,11 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): params = self._GetParamsCached() conversion_params = self.GetConversionParams(run_params) logging.info(conversion_params) + + config_for_trt = config_pb2.ConfigProto(gpu_options=self._GetGPUOptions()) + if conversion_params.rewriter_config is not None: + config_for_trt.graph_options.rewrite_options.CopyFrom( + conversion_params.rewriter_config) return trt_convert.create_inference_graph( input_graph_def=gdef, outputs=params.input_names + params.output_names, @@ -301,7 +341,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): is_dynamic_op=conversion_params.is_dynamic_op, maximum_cached_engines=conversion_params.maximum_cached_engines, cached_engine_batch_sizes=conversion_params.cached_engine_batch_sizes, - rewriter_config=conversion_params.rewriter_config) + use_calibration=conversion_params.use_calibration, + session_config=config_for_trt) def _WriteGraph(self, run_params, gdef, graph_state): if graph_state == GraphState.ORIGINAL: @@ -400,10 +441,12 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): is_dynamic_engine = not node.attr["static_engine"].b self.assertEqual(run_params.dynamic_engine, is_dynamic_engine, node.name) + self.assertEqual(node.attr["use_calibration"].b, + run_params.use_calibration, node.name) has_calibration_data = len(node.attr["calibration_data"].s) if (IsQuantizationMode(run_params.precision_mode) and - graph_state == GraphState.INFERENCE): + run_params.use_calibration and graph_state == GraphState.INFERENCE): self.assertTrue(has_calibration_data, node.name) else: self.assertFalse(has_calibration_data, node.name) @@ -438,6 +481,11 @@ class 
TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): # types. scale = 10.0 if np.issubdtype(dtype, np.integer) else 1.0 dims = params.input_dims[i] + # TODO(laigd): add debug options. E.g. we can set the input data to be + # continuous natural numbers: + # seq = np.arange(np.prod(dims)) + # seq.resize(dims) + # input_data.append(scale * seq.astype(dtype)) input_data.append((scale * np.random.random_sample(dims)).astype(dtype)) self._VerifyGraphDef(run_params, input_gdef, GraphState.ORIGINAL) @@ -449,7 +497,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): config_no_trt, GraphState.ORIGINAL) # Run calibration if necessary. - if IsQuantizationMode(run_params.precision_mode): + if (IsQuantizationMode(run_params.precision_mode) and + run_params.use_calibration): calib_config = self._GetConfigProto(run_params, GraphState.CALIBRATE) logging.info("Running calibration graph, config:\n%s", str(calib_config)) @@ -519,27 +568,38 @@ def _AddTests(test_class): use_optimizer_options = [False, True] dynamic_engine_options = [False, True] - for (use_optimizer, precision_mode, dynamic_engine) in itertools.product( - use_optimizer_options, PRECISION_MODES, dynamic_engine_options): + use_calibration_options = [False, True] + opts = itertools.product(use_optimizer_options, PRECISION_MODES, + dynamic_engine_options, use_calibration_options) + for (use_optimizer, precision_mode, dynamic_engine, use_calibration) in opts: if IsQuantizationMode(precision_mode): if use_optimizer: # TODO(aaroey): if use_optimizer is True we need to get the inference # graphdef using custom python wrapper class, which is not currently # supported yet. continue - if not dynamic_engine: + if use_calibration and not dynamic_engine: + # A static engine is supported when use_calibration=False, so we want to + # test that combination. With use_calibration=True, only dynamic engines + # are supported. # TODO(aaroey): construction of static calibration engine is not # supported yet.
continue + else: + if use_calibration: + # Don't calibrate in FP32 or FP16 mode + continue conversion = "OptimizerConversion" if use_optimizer else "ToolConversion" - engine_type = ("DynamicEngine" if dynamic_engine else "StaticEngine") - test_name = "%s_%s_%s" % (conversion, precision_mode, engine_type) + engine_type = "DynamicEngine" if dynamic_engine else "StaticEngine" + calibration_type = "UseCalibration" if use_calibration else "NoCalibration" + test_name = "%s_%s_%s_%s" % (conversion, engine_type, precision_mode, + calibration_type) run_params = RunParams( use_optimizer=use_optimizer, precision_mode=precision_mode, dynamic_engine=dynamic_engine, - test_name=test_name) + test_name=test_name, + use_calibration=use_calibration) setattr(test_class, "testTfTrt_" + test_name, _GetTest(run_params)) diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/contrib/tensorrt/test/unary_test.py index 8736bfb6449..9fc50e05952 100644 --- a/tensorflow/contrib/tensorrt/test/unary_test.py +++ b/tensorflow/contrib/tensorrt/test/unary_test.py @@ -107,8 +107,8 @@ class UnaryTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" return [ - "my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3", - "my_trt_op_4" + "TRTEngineOp_0", "TRTEngineOp_1", "TRTEngineOp_2", "TRTEngineOp_3", + "TRTEngineOp_4" ] diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py index b0271a04b36..b29626d2c28 100644 --- a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py +++ b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py @@ -76,7 +76,7 @@ class VGGBlockNCHWTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - return ["my_trt_op_0"] + return ["TRTEngineOp_0"] if __name__ == "__main__": diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_test.py index d7c165784bf..9b0b1896260 100644 --- a/tensorflow/contrib/tensorrt/test/vgg_block_test.py +++ b/tensorflow/contrib/tensorrt/test/vgg_block_test.py @@ -67,7 +67,7 @@ class VGGBlockTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - return ["my_trt_op_0"] + return ["TRTEngineOp_0"] if __name__ == "__main__": diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD index c230919168b..4b90b596b28 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/BUILD +++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD @@ -104,8 +104,10 @@ py_test( srcs = [ "estimators_test.py", ], + shard_count = 3, srcs_version = "PY2AND3", tags = [ + "no_mac", "no_pip_gpu", # b/63391119 "nomsan", # Takes too long to run. 
"notsan", # b/67865658 diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py index af68aa03cf6..146ed9f2713 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py +++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py @@ -32,7 +32,7 @@ from tensorflow.contrib.timeseries.python.timeseries.state_space_models.filterin from tensorflow.python.estimator import estimator_lib from tensorflow.python.estimator.canned import optimizers from tensorflow.python.estimator.export import export_lib -from tensorflow.python.feature_column import feature_column +from tensorflow.python.feature_column import feature_column_lib as feature_column from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py index ffd838be40e..7d780559f97 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py @@ -30,7 +30,7 @@ from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils from tensorflow.python.client import session from tensorflow.python.estimator import estimator_lib -from tensorflow.python.feature_column import feature_column +from tensorflow.python.feature_column import feature_column_lib as feature_column from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.platform import test diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py index 90c7d8ac1a9..8f692d94da4 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py @@ -38,7 +38,7 @@ from tensorflow.core.example import example_pb2 from tensorflow.python.client import session as session_lib from tensorflow.python.estimator import estimator_lib -from tensorflow.python.feature_column import feature_column +from tensorflow.python.feature_column import feature_column_lib as feature_column from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py index 43c5267e632..aab33064386 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py +++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py @@ -802,7 +802,7 @@ class InputStatisticsFromMiniBatch(object): array_ops.shape(times)[1] - 1, self._dtype)) # Co-locate updates with their variables to minimize race conditions when # updating statistics. - with ops.colocate_with(auxiliary_variables.max_time_seen): + with ops.device(auxiliary_variables.max_time_seen.device): # There is a race condition if this value is being updated from multiple # workers. However, it should eventually reach the correct value if the # last chunk is presented enough times. 
@@ -810,16 +810,16 @@ class InputStatisticsFromMiniBatch(object): auxiliary_variables.max_time_seen, gen_math_ops.maximum(auxiliary_variables.max_time_seen, math_ops.reduce_max(times))) - with ops.colocate_with(auxiliary_variables.chunk_count): + with ops.device(auxiliary_variables.chunk_count.device): chunk_count_assign = state_ops.assign_add(auxiliary_variables.chunk_count, array_ops.shape( times, out_type=dtypes.int64)[0]) - with ops.colocate_with(auxiliary_variables.inter_observation_duration_sum): + with ops.device(auxiliary_variables.inter_observation_duration_sum.device): inter_observation_duration_assign = state_ops.assign_add( auxiliary_variables.inter_observation_duration_sum, math_ops.reduce_sum(batch_inter_observation_duration)) - with ops.colocate_with(auxiliary_variables.example_count): + with ops.device(auxiliary_variables.example_count.device): example_count_assign = state_ops.assign_add( auxiliary_variables.example_count, array_ops.size(times, out_type=dtypes.int64)) @@ -829,11 +829,11 @@ class InputStatisticsFromMiniBatch(object): # the series are then members of fewer chunks. For series which are much # longer than the chunk size (the usual/expected case), this effect becomes # irrelevant. - with ops.colocate_with(auxiliary_variables.overall_feature_sum): + with ops.device(auxiliary_variables.overall_feature_sum.device): overall_feature_sum_assign = state_ops.assign_add( auxiliary_variables.overall_feature_sum, math_ops.reduce_sum(values, axis=[0, 1])) - with ops.colocate_with(auxiliary_variables.overall_feature_sum_of_squares): + with ops.device(auxiliary_variables.overall_feature_sum_of_squares.device): overall_feature_sum_of_squares_assign = state_ops.assign_add( auxiliary_variables.overall_feature_sum_of_squares, math_ops.reduce_sum(values**2, axis=[0, 1])) @@ -869,7 +869,7 @@ class InputStatisticsFromMiniBatch(object): state_ops.assign(statistics.series_start_moments.mean, mean), state_ops.assign(statistics.series_start_moments.variance, variance)) - with ops.colocate_with(statistics.start_time): + with ops.device(statistics.start_time.device): series_start_update = control_flow_ops.cond( # Update moments whenever we even match the lowest time seen so far, # to ensure that series start statistics are eventually updated to diff --git a/tensorflow/contrib/timeseries/python/timeseries/model.py b/tensorflow/contrib/timeseries/python/timeseries/model.py index edd97b2a4c1..a8cd4287e00 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/model.py +++ b/tensorflow/contrib/timeseries/python/timeseries/model.py @@ -27,7 +27,7 @@ from tensorflow.contrib.timeseries.python.timeseries import math_utils from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures -from tensorflow.python.feature_column import feature_column +from tensorflow.python.feature_column import feature_column_lib as feature_column from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD index 3c07a74ed8a..125750e7639 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD +++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD @@ -40,7 +40,10 @@ py_test( timeout = "long", # Moderate 
but for asan srcs = ["state_space_model_test.py"], srcs_version = "PY2AND3", - tags = ["no_windows"], # TODO: needs investigation on Windows + tags = [ + "no_mac", + "no_windows", # TODO: needs investigation on Windows + ], deps = [ ":state_space_model", "//tensorflow/contrib/layers:layers_py", diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index a0a9cb3f31a..05d2ebd2e8a 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -14,6 +14,7 @@ load("//tensorflow:tensorflow.bzl", "tf_py_test") package( default_visibility = [ "//cloud/vmm/testing/tests/tpu:__subpackages__", + "//knowledge/cerebra/sense/im2query:__subpackages__", "//learning/brain:__subpackages__", "//learning/deepmind:__subpackages__", "//medical/pathology:__subpackages__", @@ -215,7 +216,7 @@ py_library( ], deps = [ ":tpu_lib", - "//tensorflow/contrib/cluster_resolver:tpu_cluster_resolver_py", + "//tensorflow/contrib/cluster_resolver:cluster_resolver_py", "//tensorflow/contrib/distribute", "//tensorflow/contrib/framework:framework_py", "//tensorflow/contrib/tpu/proto:compilation_result_proto_py", @@ -263,7 +264,7 @@ py_library( ":tpu_py", "//tensorflow/compiler/xla/experimental/xla_sharding", "//tensorflow/compiler/xla/python_api:xla_shape", - "//tensorflow/contrib/cluster_resolver:tpu_cluster_resolver_py", + "//tensorflow/contrib/cluster_resolver:cluster_resolver_py", "//tensorflow/contrib/compiler:xla", "//tensorflow/contrib/tpu/proto:compilation_result_proto_py", "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_py", diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py index 63641e00c5d..a081c4354a7 100644 --- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py +++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py @@ -90,12 +90,12 @@ def main(unused_argv=None): tf_version = tf.__version__ print('TensorFlow version %s detected' % tf_version) - if FLAGS.service_addr is None and FLAGS.tpu is None: + if not FLAGS.service_addr and not FLAGS.tpu: sys.exit('You must specify either --service_addr or --tpu.') tpu_cluster_resolver = None - if FLAGS.service_addr is not None: - if FLAGS.tpu is not None: + if FLAGS.service_addr: + if FLAGS.tpu: tf.logging.warn('Both --service_addr and --tpu are set. Ignoring ' '--tpu and using --service_addr.') service_addr = FLAGS.service_addr diff --git a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py index 1cf7f9fcf67..1b09ce173a6 100644 --- a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py +++ b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py @@ -80,6 +80,8 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook): self._summary_writer = None self._global_step_tensor = None + self._last_checkpoint_step = None + def _set_steps_per_run(self, steps_per_run): self._steps_per_run = steps_per_run @@ -137,8 +139,7 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook): last_step = session.run(self._global_step_tensor) - # Save the last checkpoint synchronously if needed. 
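The async_checkpoint.py change here records the global step of the most recent save in _last_checkpoint_step so that end() only writes a final synchronous checkpoint when training actually advanced past the last saved step. A compressed sketch of that bookkeeping (hypothetical class, threading details omitted):

```
# Sketch only: remember the last saved step and skip a redundant final save.
class _CheckpointBookkeeping(object):  # hypothetical stand-in for the hook
    def __init__(self):
        self._last_checkpoint_step = None

    def _save(self, step):
        # ... write the checkpoint ...
        self._last_checkpoint_step = step

    def end(self, last_step):
        if self._last_checkpoint_step != last_step:
            self._save(last_step)  # synchronous final save only when needed
```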
- if last_step != self._timer.last_triggered_step(): + if self._last_checkpoint_step != last_step: self._save(session, last_step, asynchronous=False) for l in self._listeners: @@ -174,6 +175,7 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook): logging.info("Checkpoint finished for %d into %s.", step, self._save_path) if not asynchronous: + self._last_checkpoint_step = step _save_fn() return @@ -183,6 +185,7 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook): logging.info("Saver thread still in progress, skipping checkpoint.") return + self._last_checkpoint_step = step self._save_thread = threading.Thread(target=_save_fn) self._save_thread.start() diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py index c694e9c1bca..8d6245390fc 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets.py @@ -133,7 +133,7 @@ def StreamingFilesDataset(files, with ops.device('/job:%s' % file_reader_job): if isinstance(files, str): source_dataset = dataset_ops.Dataset.list_files(files) - elif isinstance(files, dataset_ops.Dataset): + elif isinstance(files, dataset_ops.DatasetV2): source_dataset = files else: raise ValueError('files was not a string or a dataset: %s' % files) @@ -156,7 +156,7 @@ def StreamingFilesDataset(files, source_dataset = source_dataset.prefetch(1) - source_iterator = source_dataset.make_one_shot_iterator() + source_iterator = dataset_ops.make_one_shot_iterator(source_dataset) source_handle = source_iterator.string_handle() @function.Defun(dtypes.string) diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py index b58d05eac56..52d87b80040 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py @@ -70,7 +70,7 @@ class DatasetsTest(test.TestCase): dataset = datasets.StreamingFilesDataset( os.path.join(self.get_temp_dir(), 'text_line.*.txt'), filetype='text') - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self._sess.run(iterator.initializer) get_next = iterator.get_next() @@ -94,7 +94,7 @@ class DatasetsTest(test.TestCase): dataset = datasets.StreamingFilesDataset( os.path.join(self.get_temp_dir(), 'tf_record*'), filetype='tfrecord') - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self._sess.run(iterator.initializer) get_next = iterator.get_next() @@ -121,7 +121,7 @@ class DatasetsTest(test.TestCase): dataset = datasets.StreamingFilesDataset(filenames, filetype='tfrecord') - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self._sess.run(iterator.initializer) get_next = iterator.get_next() @@ -154,7 +154,7 @@ class DatasetsTest(test.TestCase): os.path.join(self.get_temp_dir(), 'fixed_length*'), filetype=FixedLengthFile) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self._sess.run(iterator.initializer) get_next = iterator.get_next() @@ -177,7 +177,7 @@ class DatasetsTest(test.TestCase): dataset = datasets.StreamingFilesDataset( dataset_ops.Dataset.range(10), filetype=gen_dataset) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self._sess.run(iterator.initializer) get_next = 
iterator.get_next() diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py index 08f58a5f5b8..ebf40827e45 100644 --- a/tensorflow/contrib/tpu/python/tpu/keras_support.py +++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py @@ -81,6 +81,7 @@ from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import models from tensorflow.python.keras import optimizers as keras_optimizers from tensorflow.python.keras.engine import base_layer +from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.engine import training_arrays from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.layers import embeddings @@ -438,7 +439,7 @@ class TPURewriteContext(object): self._default_placeholder = array_ops.placeholder self._default_name_scope = ops.name_scope - self._default_make_variable = base_layer.make_variable + self._default_make_variable = base_layer_utils.make_variable self._default_random_normal = random_ops.random_normal self._default_qr = gen_linalg_ops.qr @@ -486,14 +487,14 @@ class TPURewriteContext(object): gen_linalg_ops.qr = qr ops.name_scope = _name_scope - base_layer.make_variable = variable_scope.get_variable + base_layer_utils.make_variable = variable_scope.get_variable logging.info('Overriding default placeholder.') return def __exit__(self, exc_type, exc_val, exc_tb): array_ops.placeholder = self._default_placeholder ops.name_scope = self._default_name_scope - base_layer.make_variable = self._default_make_variable + base_layer_utils.make_variable = self._default_make_variable random_ops.random_normal = self._default_random_normal gen_linalg_ops.qr = self._default_qr @@ -728,7 +729,7 @@ class TPUDatasetInfeedManager(TPUInfeedManager): dummy_x_shape[0] *= tpu_assignment.num_towers dummy_y_shape = dataset.output_shapes[1].as_list() dummy_y_shape[0] *= tpu_assignment.num_towers - self._iterator = dataset.make_initializable_iterator() + self._iterator = dataset_ops.make_initializable_iterator(dataset) K.get_session().run(self._iterator.initializer) self._get_next_ops = [] @@ -769,7 +770,7 @@ class TPUDatasetInfeedManager(TPUInfeedManager): def _verify_dataset_shape(self, dataset): """Verifies a dataset is of an appropriate shape for TPUs.""" - if not isinstance(dataset, dataset_ops.Dataset): + if not isinstance(dataset, dataset_ops.DatasetV2): raise ValueError('The function passed as the `x` parameter did not ' 'return a `tf.data.Dataset`.') if not isinstance(dataset.output_classes, tuple): @@ -1012,9 +1013,10 @@ class TPUFunction(object): optimizer=_replicated_optimizer(self._cloned_optimizer), loss=self.model.loss, loss_weights=self.model.loss_weights, - metrics=metrics_module.clone_metrics(self.model.metrics), + metrics=metrics_module.clone_metrics( + self.model._compile_metrics), weighted_metrics=metrics_module.clone_metrics( - self.model.weighted_metrics), + self.model._compile_weighted_metrics), target_tensors=tpu_targets, ) @@ -1184,12 +1186,9 @@ class TPUFunction(object): # pipelined loop. return None, None - if not isinstance(K.learning_phase(), int): + if isinstance(inputs[-1], int): # Remove the learning_phase flag at the end. We currently hard code the # learning_phase in TPUFunction. - assert isinstance(inputs[-1], int), ( - 'Expect the final element be learning_phase flag. 
Got {}'.format( - inputs[-1])) inputs = inputs[:-1] if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or @@ -1379,6 +1378,7 @@ class KerasTPUModel(models.Model): self.train_function = None self._fit_function = None self._eval_function = None + self._stateful_metric_functions = [] cluster_resolver = strategy._tpu_cluster_resolver self._tpu_name_or_address = cluster_resolver.get_master() @@ -1393,10 +1393,10 @@ class KerasTPUModel(models.Model): self.compile( self._cpu_model.optimizer, self._cpu_model.loss, - self._cpu_model.metrics, + self._cpu_model._compile_metrics, self._cpu_model.loss_weights, self._cpu_model.sample_weight_mode, - self._cpu_model.weighted_metrics, + self._cpu_model._compile_weighted_metrics, self._cpu_model.target_tensors, ) @@ -1466,7 +1466,7 @@ class KerasTPUModel(models.Model): assert not self._numpy_to_infeed_manager_list # Ensure empty. infeed_managers = [] # Managers to clean up at the end of the fit call. - if isinstance(x, dataset_ops.Dataset): + if isinstance(x, dataset_ops.DatasetV2): # TODO(b/111413240): Support taking a tf.data.Dataset directly. raise ValueError( 'Taking a Dataset directly is not yet supported. Please ' @@ -1492,7 +1492,7 @@ class KerasTPUModel(models.Model): y = infeed_manager.dummy_y infeed_managers.append((x, infeed_manager)) - if isinstance(validation_data, dataset_ops.Dataset): + if isinstance(validation_data, dataset_ops.DatasetV2): # TODO(b/111413240): Support taking a tf.data.Dataset directly. raise ValueError( 'Taking a Dataset directly is not yet supported. Please ' @@ -1551,7 +1551,7 @@ class KerasTPUModel(models.Model): with _tpu_session_context(): # Managers to clean up at the end of the evaluate call. infeed_managers = [] - if isinstance(x, dataset_ops.Dataset): + if isinstance(x, dataset_ops.DatasetV2): # TODO(b/111413240): Support taking a tf.data.Dataset directly. raise ValueError( 'Taking a Dataset directly is not yet supported. Please ' @@ -1676,14 +1676,10 @@ class KerasTPUModel(models.Model): callbacks, self, do_validation=do_validation, - val_inputs=val_inputs, - val_targets=val_targets, - val_sample_weights=val_sample_weights, batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=num_training_samples, - validation_steps=validation_steps, verbose=verbose, count_mode=count_mode) @@ -1700,7 +1696,7 @@ class KerasTPUModel(models.Model): callbacks.on_train_begin() for epoch in range(initial_epoch, epochs): # Reset stateful metrics - for m in self.stateful_metric_functions: + for m in self.metrics: m.reset_states() # Update callbacks callbacks.on_epoch_begin(epoch) @@ -1923,7 +1919,7 @@ class KerasTPUModel(models.Model): if validation_data: if (isinstance(validation_data, iterator_ops.Iterator) or isinstance(validation_data, iterator_ops.EagerIterator) or - isinstance(validation_data, dataset_ops.Dataset)): + isinstance(validation_data, dataset_ops.DatasetV2)): raise ValueError('KerasTPUModel cannot handle a Dataset or Iterator ' 'for validation_data. 
Please instead pass a function ' 'that returns a `tf.data.Dataset`.') @@ -1998,14 +1994,14 @@ class KerasTPUModel(models.Model): self._optimizer = optimizer @property - def stateful_metric_functions(self): + def metrics(self): if self._tpu_model: - return self._tpu_model.stateful_metric_functions + return self._tpu_model.metrics return self._stateful_metric_functions - @stateful_metric_functions.setter - def stateful_metric_functions(self, stateful_metric_functions): - self._stateful_metric_functions = stateful_metric_functions + @metrics.setter + def metrics(self, metrics): + self._stateful_metric_functions = metrics def _make_train_function(self): if not self.train_function: @@ -2230,10 +2226,10 @@ def tpu_model(model, strategy=None): cpu_model.compile( _clone_optimizer(model.optimizer, optimizer_config), model.loss, - metrics_module.clone_metrics(model.metrics), + metrics_module.clone_metrics(model._compile_metrics), model.loss_weights, model.sample_weight_mode, - metrics_module.clone_metrics(model.weighted_metrics), + metrics_module.clone_metrics(model._compile_weighted_metrics), ) if model_weights: diff --git a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py index 28d3a938510..8b0b240dc73 100644 --- a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py +++ b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py @@ -217,6 +217,10 @@ class ReplicatedVariable(object): def get(self): return self._primary_var + @property + def _in_graph_mode(self): + return self._primary_var._in_graph_mode # pylint: disable=protected-access + def _should_act_as_resource_variable(self): """Pass resource_variable_ops.is_resource_variable check.""" pass diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py index e3e791faacb..def57da20d6 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu.py @@ -1001,8 +1001,8 @@ def rewrite(computation, `rewrite` is a list of tensors corresponding to the tensors from the output of `computation`. - All `Operation`s returned from `computation` will be executed when - evaluating any of the returned output tensors. + All `Operation`s constructed during `computation` will be executed when + evaluating any of the returned output tensors, not just the ones returned. inputs: A list of input tensors or `None` (equivalent to an empty list). infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple of arguments as inputs to `computation`. @@ -1111,7 +1111,7 @@ def validate_inference_rewrite_for_variables(graph): Raises: RuntimeError: if validation failed. """ - if not any([x.type == "GuaranteeConst" for x in graph.get_operations()]): + if not any(x.type == "GuaranteeConst" for x in graph.get_operations()): raise RuntimeError( "No GuaranteeConst ops found in the graph after running " "tpu.rewrite_for_inference(...). 
Please check that you are using " diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py index da6bdf67d68..67246244794 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py @@ -41,7 +41,7 @@ _NUM_CORES_TO_COMPUTATION_SHAPE = { class TPUContext(object): - """The context of current input_fn invocation.""" + """A context that holds the current configuration of the TPU computation.""" def __init__(self, internal_ctx, diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py index 3fe896426a7..ccba8a46c7c 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py @@ -1069,17 +1069,14 @@ def _create_partitioned_variables(name, 'As TPU embedding is not optimized for small tables, ' 'please consider other ways for this embedding lookup.') - slicing = [num_hosts, 1] - - # TODO(shizhiw): deprecated, use tf.get_variable()? - return partitioned_variables.create_partitioned_variables( - name=name, - slicing=slicing, + return list(variable_scope.get_variable( + name, shape=(vocabulary_size, embedding_dimension), + partitioner=partitioned_variables.fixed_size_partitioner(num_hosts), dtype=dtypes.float32, initializer=initializer, collections=collections, - trainable=False) + trainable=False)) @ops.RegisterGradient('TPUEmbeddingActivations') diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 7cb8c4aa7f1..a9dc542ae5e 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -298,9 +298,9 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec): # pylint: disable=prote host_calls['host_call'] = host_call _OutfeedHostCall.validate(host_calls) - training_hooks = list(training_hooks or []) - evaluation_hooks = list(evaluation_hooks or []) - prediction_hooks = list(prediction_hooks or []) + training_hooks = tuple(training_hooks or []) + evaluation_hooks = tuple(evaluation_hooks or []) + prediction_hooks = tuple(prediction_hooks or []) for hook in training_hooks + evaluation_hooks + prediction_hooks: if not isinstance(hook, session_run_hook.SessionRunHook): @@ -335,7 +335,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec): # pylint: disable=prote hooks = None if self.host_call is not None: hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])] - hooks = list(hooks or []) + hooks = tuple(hooks or []) scaffold = self.scaffold_fn() if self.scaffold_fn else None return model_fn_lib.EstimatorSpec( mode=self.mode, @@ -2169,7 +2169,6 @@ class TPUEstimator(estimator_lib.Estimator): builder, input_receiver_fn_map, checkpoint_path, - strip_default_attrs, save_variables=True, mode=model_fn_lib.ModeKeys.PREDICT, export_tags=None, @@ -2184,7 +2183,6 @@ class TPUEstimator(estimator_lib.Estimator): builder, input_receiver_fn_map, checkpoint_path, - strip_default_attrs, save_variables, mode=mode, export_tags=export_tags, @@ -2201,7 +2199,6 @@ class TPUEstimator(estimator_lib.Estimator): builder, input_receiver_fn_map, checkpoint_path, - strip_default_attrs, save_variables=False, mode=mode, export_tags=export_tags, @@ -2783,7 +2780,7 @@ def _export_output_to_tensors(export_output): elif isinstance(export_output, export_output_lib.RegressionOutput): return [export_output.value] elif isinstance(export_output, export_output_lib.PredictOutput): - 
return export_output.outputs.values() + return list(export_output.outputs.values()) else: raise ValueError( '`export_output` must be have type `ClassificationOutput`, ' @@ -3059,7 +3056,7 @@ class _Inputs(object): @staticmethod def from_input_fn(return_values): """Returns an `_Inputs` instance according to `input_fn` return value.""" - if isinstance(return_values, dataset_ops.Dataset): + if isinstance(return_values, dataset_ops.DatasetV2): dataset = return_values return _Inputs(dataset=dataset) @@ -3084,7 +3081,7 @@ class _Inputs(object): The initializer must be run before calling `features_and_labels`. """ - self._iterator = self._dataset.make_initializable_iterator() + self._iterator = dataset_ops.make_initializable_iterator(self._dataset) return self._iterator.initializer def features_and_labels(self): diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py index 3786e52b949..55235556de0 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py @@ -71,7 +71,7 @@ class TPUEstimatorStoppingSignalsTest(test.TestCase): with ops.Graph().as_default(): dataset = input_fn(params) - features = dataset.make_one_shot_iterator().get_next() + features = dataset_lib.make_one_shot_iterator(dataset).get_next() # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape. self.assertIsNone(features['a'].shape.as_list()[0]) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py index e75a09492ec..d5957b7e8ec 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py @@ -26,7 +26,6 @@ import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding -from tensorflow.compiler.xla.python_api import xla_shape from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.contrib.tpu.python.tpu import tpu from tensorflow.contrib.tpu.python.tpu import tpu_sharding @@ -92,8 +91,7 @@ class InfeedQueue(object): else: raise ValueError( "number of tuple elements cannot be inferred from InfeedQueue " - "constructor" - ) + "constructor") if number_of_tuple_elements <= 0: raise ValueError("number_of_tuple_elements %d must be > 0" % number_of_tuple_elements) @@ -293,9 +291,8 @@ class InfeedQueue(object): self.number_of_tuple_elements """ if len(input_tensors) != self.number_of_tuple_elements: - raise ValueError( - "input_tensors is %s, but should be a list of %d Tensors", ( - str(input_tensors), self.number_of_tuple_elements)) + raise ValueError("input_tensors is %s, but should be a list of %d Tensors" + % (str(input_tensors), self.number_of_tuple_elements)) self.set_tuple_shapes([t.shape for t in input_tensors]) self.set_tuple_types([t.dtype for t in input_tensors]) @@ -451,8 +448,8 @@ class InfeedQueue(object): for i in xrange(1, self.number_of_tuple_elements): if devices[0] != devices[i]: raise ValueError( - "input devices for shard %d are %s, but should all be the same", - index, str(devices)) + "input devices for shard %d are %s, but should all be the same" % + (index, str(devices))) with ops.colocate_with(inputs[0]): return tpu_ops.infeed_enqueue_tuple( inputs=inputs, @@ -792,18 +789,14 @@ class _PartitionedInfeedQueue(InfeedQueue): Args: tensor: Input tensor for partitioning. 
- dims: A list of integer describes how to partition the input tensor. + dims: A 1-D np.array describing how to partition the input tensor. Raises: ValueError: If the tensor can't be partitioned by dims or the num_cores_per_replica doesn't match the number of partitions(dims.prod()). """ - if dims is None: - return - - dims = np.array(dims) - if (dims < 1).any(): raise ValueError("All input partition dims must be >= 1.") @@ -823,11 +816,6 @@ class _PartitionedInfeedQueue(InfeedQueue): "partition dims = {}).".format(tensor.shape.as_list(), dims)) tensor.shape.assert_is_fully_defined() - if (np.array(tensor.shape.as_list()) % dims != 0).any(): - raise ValueError( - "All input partition dims must divide exactly into the `Tensor` " - "shape (tensor shape = {}, input partition dims = {}).".format( - tensor.shape.as_list(), dims)) def _partition_or_replicate_on_host(self, tensor, dims): """Partitions or replicates the input tensor. @@ -840,16 +828,39 @@ class _PartitionedInfeedQueue(InfeedQueue): Returns: An iterator of `Tensor`s or a list of partioned tensors. """ - self._check_input_partition_dims(tensor, dims) if dims is None: return itertools.repeat(tensor) - else: - output = [tensor] - for axis, dim in enumerate(dims): - if dim > 1: - output = [array_ops.split(x, dim, axis=axis) for x in output] - output = nest.flatten(output) - return output + dims = np.array(dims) + self._check_input_partition_dims(tensor, dims) + output = [tensor] + shape_list = np.array(tensor.shape.as_list()) + quotients, remainders = np.divmod(shape_list, dims) + for axis, (quotient, remainder, dim, original_size) in enumerate( + zip(quotients, remainders, dims, shape_list)): + if dim <= 1: + continue + if remainder > 0: + # When a dimension cannot be evenly partitioned, XLA assumes tensors are + # partitioned in a greedy manner, using ceil_ratio = ceil(size / dim) + # first. E.g. for a 2D tensor with shape (5, 14) and dims (2, 4): since + # 5 % 2 = 1 and 14 % 4 = 2, the split sizes are [3, 2] and [4, 4, 4, 2], + # giving partition shapes + # [[(3, 4), (3, 4), (3, 4), (3, 2)], + # [(2, 4), (2, 4), (2, 4), (2, 2)]] + ceil_ratio = quotient + 1 + num_full_slots, left_over = np.divmod(original_size, ceil_ratio) + num_or_size_splits = [ceil_ratio] * num_full_slots + [left_over] + if len(num_or_size_splits) < dim: + num_or_size_splits += [0] * (dim - len(num_or_size_splits)) + new_output = [] + for x in output: + new_output.append( + array_ops.split( + x, num_or_size_splits=num_or_size_splits, axis=axis)) + output = new_output + else: + output = [array_ops.split(x, dim, axis=axis) for x in output] + output = nest.flatten(output) + return output def _tag_sharding_attribute_for_dequeued_tensor(self, tensor, dims): """Tags appropriate XLA sharding attribute to the dequeued tensor.
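The uneven-partitioning branch above can be checked with plain numpy. The following standalone sketch reproduces the per-axis split sizes for the (5, 14) tensor and (2, 4) partition dims used in the comment; no TensorFlow is required.

```
# Sketch only: compute split sizes the same greedy way as the code above.
import numpy as np

shape = np.array([5, 14])
dims = np.array([2, 4])
quotients, remainders = np.divmod(shape, dims)
for axis, (q, r, dim, size) in enumerate(zip(quotients, remainders, dims, shape)):
    if r > 0:
        ceil_ratio = int(q) + 1
        num_full, left_over = divmod(int(size), ceil_ratio)
        splits = [ceil_ratio] * num_full + [left_over]
        splits += [0] * (int(dim) - len(splits))
    else:
        splits = [int(q)] * int(dim)
    print(axis, splits)
# Prints 0 [3, 2] and 1 [4, 4, 4, 2], i.e. partition shapes
# [[(3, 4), (3, 4), (3, 4), (3, 2)], [(2, 4), (2, 4), (2, 4), (2, 2)]].
```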
@@ -866,13 +877,9 @@ class _PartitionedInfeedQueue(InfeedQueue): elif np.prod(dims) == 1: return xla_sharding.assign_device(tensor, 0) else: - tile_shape = np.array(tensor.shape.as_list()) // dims tile_assignment = np.arange(np.prod(dims)).reshape(dims) return xla_sharding.tile( tensor=tensor, - tile_shape=xla_shape.CreateShapeFromDtypeAndTuple( - dtype=np.dtype(tensor.dtype.as_numpy_dtype), - shape_tuple=tile_shape), tile_assignment=tile_assignment) def _tag_sharding_attribute_for_dequeued_tensors(self, dequeues, dims): diff --git a/tensorflow/contrib/tpu/python/tpu/training_loop.py b/tensorflow/contrib/tpu/python/tpu/training_loop.py index b6c350ecd75..0187b4bec6e 100644 --- a/tensorflow/contrib/tpu/python/tpu/training_loop.py +++ b/tensorflow/contrib/tpu/python/tpu/training_loop.py @@ -166,8 +166,8 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None): # control dependencies from any side-effecting operations. if input_arity == 0: inputs = [array_ops.constant(0)] - return control_flow_ops.while_loop(condition_wrapper, body_wrapper, inputs, - name="") + return control_flow_ops.while_loop( + condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1) def repeat(n, body, inputs=None, infeed_queue=None, name=None): diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md index b6514e19dc9..552febd80bd 100644 --- a/tensorflow/contrib/tpu/tpu_estimator.md +++ b/tensorflow/contrib/tpu/tpu_estimator.md @@ -89,12 +89,9 @@ handle training: dataset = tf.data.TFRecordDataset( filename, buffer_size=FLAGS.dataset_reader_buffer_size) - dataset = dataset.map(parser).cache().repeat().batch(batch_size) - images, labels = dataset.make_one_shot_iterator().get_next() - # set_shape to give inputs statically known shapes. 
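The tpu_estimator.md hunk here replaces manual iterator creation and set_shape calls with an input_fn that simply returns a dataset batched with drop_remainder=True, which keeps the batch dimension statically known. A hedged sketch of the resulting input_fn (file name, parser, and buffer size are placeholders):

```
# Sketch only: an input_fn whose batches have a fully defined shape.
import tensorflow as tf

def make_input_fn(filename, parser, buffer_size):
    def input_fn(params):
        batch_size = params["batch_size"]
        dataset = tf.data.TFRecordDataset(filename, buffer_size=buffer_size)
        # drop_remainder=True replaces the manual set_shape calls removed in
        # this hunk by guaranteeing a static batch dimension.
        return dataset.map(parser).cache().repeat().batch(
            batch_size, drop_remainder=True)
    return input_fn
```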
- images.set_shape([batch_size, 28 * 28]) - labels.set_shape([batch_size]) - return images, labels + dataset = dataset.map(parser).cache().repeat().batch( + batch_size, drop_remainder=True) + return dataset return input_fn diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD index 00295f57f60..f6427ae05a2 100644 --- a/tensorflow/contrib/training/BUILD +++ b/tensorflow/contrib/training/BUILD @@ -26,7 +26,6 @@ py_library( "python/training/resample.py", "python/training/sampling_ops.py", "python/training/sequence_queueing_state_saver.py", - "python/training/tensor_queue_dataset.py", "python/training/training.py", "python/training/tuner.py", ], @@ -287,28 +286,6 @@ py_test( ], ) -py_test( - name = "tensor_queue_dataset_test", - size = "large", - srcs = ["python/training/tensor_queue_dataset_test.py"], - srcs_version = "PY2AND3", - tags = ["notsan"], - deps = [ - ":training_py", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:gradients", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform", - "//tensorflow/python:random_seed", - "//tensorflow/python:training", - "//tensorflow/python:variables", - "//tensorflow/python/data", - "//tensorflow/python/data/experimental/kernel_tests/serialization:dataset_serialization_test_base", - "//third_party/py/numpy", - ], -) - tf_proto_library( name = "protos_all", srcs = glob(["**/*.proto"]), diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py index 3547e71184e..87ce57ef060 100644 --- a/tensorflow/contrib/training/__init__.py +++ b/tensorflow/contrib/training/__init__.py @@ -59,8 +59,6 @@ from tensorflow.contrib.training.python.training.hparam import * from tensorflow.contrib.training.python.training.resample import * from tensorflow.contrib.training.python.training.sampling_ops import * from tensorflow.contrib.training.python.training.sequence_queueing_state_saver import * -from tensorflow.contrib.training.python.training.tensor_queue_dataset import enqueue_in_queue_dataset -from tensorflow.contrib.training.python.training.tensor_queue_dataset import prepend_from_queue_and_padded_batch_dataset from tensorflow.contrib.training.python.training.training import add_gradients_summaries from tensorflow.contrib.training.python.training.training import clip_gradient_norms from tensorflow.contrib.training.python.training.training import clip_gradient_norms_fn @@ -79,7 +77,6 @@ _allowed_symbols = [ 'FeedingQueueRunner', 'get_or_create_eval_step', 'StopAfterNEvalsHook', 'SummaryAtEndHook', 'wait_for_new_checkpoint', 'add_gradients_summaries', 'clip_gradient_norms', 'clip_gradient_norms_fn', 'create_train_op', - 'multiply_gradients', 'enqueue_in_queue_dataset', - 'prepend_from_queue_and_padded_batch_dataset', 'train'] + 'multiply_gradients', 'train'] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py deleted file mode 100644 index 8896a95327a..00000000000 --- a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Python wrappers for Datasets and Iterators.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.data.util import convert -from tensorflow.python.data.util import nest -from tensorflow.python.data.util import sparse -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import tensor_util -from tensorflow.python.ops import gen_dataset_ops -from tensorflow.python.util import nest as tf_nest - - -class _PrependFromQueueAndPaddedBatchDataset(dataset_ops.UnaryDataset): - """A `Dataset` that prepends a queue to another `Dataset`. - - A vector of handles to the queue is returned as the first component of - the associated iterator. This vector can be passed to - `enqueue_in_queue_dataset` to add new elements to the queue. - """ - - def __init__(self, input_dataset, batch_size, padded_shapes, padding_values): - """Initialize `PrependFromQueueAndPaddedBatchDataset`.""" - super(_PrependFromQueueAndPaddedBatchDataset, self).__init__(input_dataset) - if sparse.any_sparse(input_dataset.output_classes): - raise TypeError( - "Batching of padded sparse tensors is not currently supported") - self._input_dataset = input_dataset - self._batch_size = ops.convert_to_tensor( - batch_size, dtype=dtypes.int64, name="batch_size") - if padded_shapes is None: - self._padded_shapes = nest.map_structure( - convert.partial_shape_to_tensor, input_dataset.output_shapes) - else: - self._padded_shapes = nest.map_structure_up_to( - input_dataset.output_shapes, convert.partial_shape_to_tensor, - padded_shapes) - # pylint: disable=protected-access - padding_values = ( - padding_values if padding_values is not None else - dataset_ops._default_padding(input_dataset)) - self._padding_values = nest.map_structure_up_to( - input_dataset.output_shapes, dataset_ops._padding_value_to_tensor, - padding_values, input_dataset.output_types) - # pylint: enable=protected-access - - def _as_variant_tensor(self): - # pylint: disable=protected-access - return gen_dataset_ops.prepend_from_queue_and_padded_batch_dataset( - self._input_dataset._as_variant_tensor(), - batch_size=self._batch_size, - padded_shapes=[ - ops.convert_to_tensor(s, dtype=dtypes.int64) - for s in nest.flatten(self._padded_shapes) - ], - padding_values=nest.flatten(self._padding_values), - output_shapes=nest.flatten( - sparse.as_dense_shapes(self.output_shapes, self.output_classes))) - # pylint: enable=protected-access - - @property - def output_classes(self): - return (ops.Tensor, self._input_dataset.output_classes) - - def _as_batch_shape(self, shape_like): - return tensor_shape.vector(None).concatenate( - tensor_util.constant_value_as_shape(shape_like)) - - @property - def output_shapes(self): - # First output is a variant representing the Queue - return (tensor_shape.vector(None), - 
nest.map_structure(self._as_batch_shape, self._padded_shapes)) - - @property - def output_types(self): - # First output is a variant representing the Queue - return (dtypes.variant, self._input_dataset.output_types) - - -def prepend_from_queue_and_padded_batch_dataset(batch_size, - padding_values=None, - padded_shapes=None): - """A transformation that prepends a queue to a `Dataset` and batches results. - - A vector of handles to the queue is returned as the first component of the - associated iterator. This vector can be passed to `enqueue_in_queue_dataset` - to add new elements to the queue. - - Below is an example of how this dataset might be used to split incoming - variable-length sequences into "head" and "rest" parts, where "rest" parts - are re-enqueued back into the dataset. A more realistic example would - perform some calculation on the "head" and modify some components of "rest" - with the result (before re-enqueueing). - - ```python - dataset = tf.data.Dataset.from_tensor_slices([2*x for x in range(10)]) - # Make a dataset of variable-length vectors and their lengths. - dataset = dataset.map(lambda count: (count, tf.ones((count,)))) - # Emit a queue we can prepend to, and counts/values as padded batch. - dataset = dataset.apply( - tf.contrib.training.prepend_from_queue_and_padded_batch_dataset( - batch_size=10)) - dataset = dataset.prefetch(1) - - iterator = dataset.make_one_shot_iterator() - queue, (count, padded_value) = iterator.get_next() - - # Split the padded_value into two pieces: head and rest - rest_indices = tf.squeeze(tf.where(count > 3), axis=1) - bound = tf.minimum(3, tf.reduce_max(count)) - value_head = padded_value[:, :bound] - count_rest = tf.gather(count - 3, rest_indices) - value_rest = tf.gather(padded_value[:, bound:], rest_indices) - queue_rest = tf.gather(queue, rest_indices) - enqueue_rest_op = tf.contrib.training.enqueue_in_queue_dataset( - queue_rest, (count_rest, value_rest)) - with tf.control_dependencies([enqueue_rest_op]): - calculation = fn(value_head) - - while True: # Will raise OutOfRange when finished with all pieces. - session.run(calculation) - ``` - - Args: - batch_size: `int64` scalar tensor. The batch size to use when performing - padded batching. - padding_values: (optional) Nested tuple of scalar tensors. If provided, - the structure and dtypes of padding_values should match that of - incoming dataset's `output_types`. - padded_shapes: (optional) Nested tuple of `int64` vector tensors. - If provided, the structure must match that of the incoming dataset's - `output_types`. If not provided, the incoming dataset's `output_shapes` - is used. Any unknown (`None` or `-1`) dimensions in the shapes are - treated as being unique per-batch: for each batch time, an unknown - dimension is replaced with the maximum given value of this dimension - across all tensors for the given component in the batch. - - Returns: - A `Dataset` transformation function, which can be passed to - `tf.data.Dataset.apply`. - """ - - def _apply_fn(dataset): - return _PrependFromQueueAndPaddedBatchDataset( - dataset, - batch_size=batch_size, - padding_values=padding_values, - padded_shapes=padded_shapes) - - return _apply_fn - - -def enqueue_in_queue_dataset(queue, components): - """Enqueue components into queue from `PrependFromQueueAndPaddedBatchDataset`. - - The components' dtypes and shapes must be compatible with the `output_shapes` - attribute of the `dataset` created by - `prepend_from_queue_and_padded_batch_dataset`. 
This operation supports both - non-batched and batched modes. - - For more details, see the example in the docstring for - `prepend_from_queue_and_padded_batch_dataset`. - - Args: - queue: `variant` scalar or vector tensor. - The tensor emitted by the first component of the iterator associated with - `prepend_from_queue_and_padded_batch_dataset`. If this is a scalar, - then the `components` input tensors should not have a prepended batch - dimension. - components: Nested tuple of tensors, each with a leading batch dimension - if `queue` is a vector. The structure, dtypes, and shapes - (excluding batch dimension) must match the nested tuples - `dataset.output_types[1]` and `dataset.output_shapes[1]` (the non-queue - output types and shapes) of the `dataset` emitted by - the original `prepend_from_queue_and_padded_batch_dataset` call. - - Returns: - An `Operation` that enqueues `components` into the dataset(s) associated - with entries of `queue`. - """ - return gen_dataset_ops.enqueue_in_queue_dataset( - queue=queue, components=tf_nest.flatten(components)) diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py deleted file mode 100644 index c1657fec7bb..00000000000 --- a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for TensorQueueDataset.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd -from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import errors -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import string_ops -from tensorflow.python.platform import test - - -class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase): - - def testNoEnqueue(self): - dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) - self.assertEqual((dtypes.variant, dtypes.int32), dataset.output_types) - self.assertAllEqual(([None],) * 2, - [x.as_list() for x in dataset.output_shapes]) - iterator = dataset.make_one_shot_iterator() - _, value = iterator.get_next() - self.assertEqual([0], self.evaluate(value)) - self.assertEqual([1], self.evaluate(value)) - self.assertEqual([2], self.evaluate(value)) - with self.assertRaisesOpError("End of sequence"): - self.evaluate(value) - - def testBatchedNoEnqueue(self): - dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2)) - iterator = dataset.make_one_shot_iterator() - _, value = iterator.get_next() - self.assertAllEqual([0, 1], self.evaluate(value)) - self.assertAllEqual([2], self.evaluate(value)) - with self.assertRaisesOpError("End of sequence"): - self.evaluate(value) - - def testBatchedWithBiggerPaddingNoEnqueue(self): - dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]]) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset( - batch_size=2, padded_shapes=[3])) - iterator = dataset.make_one_shot_iterator() - _, value = iterator.get_next() - self.assertAllEqual([[0, 0, 0], [1, 0, 0]], self.evaluate(value)) - self.assertAllEqual([[2, 0, 0]], self.evaluate(value)) - with self.assertRaisesOpError("End of sequence"): - self.evaluate(value) - - def testBatchedWithBiggerPaddingOneEnqueue(self): - dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]]) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset( - batch_size=1, padded_shapes=[3])) - iterator = dataset.make_one_shot_iterator() - queue_handle, value = iterator.get_next() - enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value) - with self.cached_session() as sess: - self.assertAllEqual([[0, 0, 0]], sess.run(value)) - value_1, _ = sess.run([value, enqueue_negative]) - self.assertAllEqual([[1, 0, 0]], value_1) - value_2, _ = sess.run([value, enqueue_negative]) - self.assertAllEqual([[-1, 0, 0]], value_2) - value_3 = sess.run(value) - self.assertAllEqual([[1, 0, 0]], value_3) - value_4, _ = sess.run([value, enqueue_negative]) - self.assertAllEqual([[2, 0, 0]], value_4) - value_5 = sess.run(value) - self.assertAllEqual([[-2, 0, 0]], value_5) - with self.assertRaisesOpError("End of sequence"): - sess.run(value) - - def testOneEnqueue(self): - dataset = 
dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) - iterator = dataset.make_one_shot_iterator() - queue_handle, value = iterator.get_next() - enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value) - with self.cached_session() as sess: - self.assertEqual([0], sess.run(value)) - value_1, _ = sess.run([value, enqueue_negative]) - self.assertEqual([1], value_1) - value_2, _ = sess.run([value, enqueue_negative]) - self.assertEqual([-1], value_2) - value_3 = sess.run(value) - self.assertEqual([1], value_3) - value_4, _ = sess.run([value, enqueue_negative]) - self.assertEqual([2], value_4) - value_5 = sess.run(value) - self.assertEqual([-2], value_5) - with self.assertRaisesOpError("End of sequence"): - sess.run(value) - - def testBatchedOneEnqueue(self): - dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2)) - iterator = dataset.make_one_shot_iterator() - queue_handle, value = iterator.get_next() - enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value) - enqueue_zeroth = tqd.enqueue_in_queue_dataset([queue_handle[0]], - array_ops.expand_dims( - value[0], axis=0)) - with self.cached_session() as sess: - value_0, _ = sess.run([value, enqueue_negative]) - self.assertAllEqual([0, 1], value_0) - value_1, _ = sess.run([value, enqueue_zeroth]) - self.assertAllEqual([0, -1], value_1) - value_2, _ = sess.run([value, enqueue_negative]) - self.assertAllEqual([0, 2], value_2) - self.assertAllEqual([0, -2], sess.run(value)) - with self.assertRaisesOpError("End of sequence"): - sess.run(value) - - def testManyEnqueue(self): - dataset = dataset_ops.Dataset.from_tensor_slices([0, 1]) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) - iterator = dataset.make_one_shot_iterator() - queue_handle, value = iterator.get_next() - enqueue_many_more = [ - tqd.enqueue_in_queue_dataset(queue_handle, value + 100 + i) - for i in range(1000) - ] - with self.cached_session() as sess: - value_0, _ = sess.run((value, enqueue_many_more)) - self.assertEqual([0], value_0) - rest = [] - for _ in range(1000): - rest.append(sess.run(value)) - self.assertEquals([[100 + i] for i in range(1000)], sorted(rest)) - # Going back to the original input. - value_1, _ = sess.run((value, enqueue_many_more)) - self.assertEqual(1, value_1) - rest = [] - for _ in range(1000): - rest.append(sess.run(value)) - self.assertEquals([[100 + i + 1] for i in range(1000)], sorted(rest)) - with self.assertRaisesOpError("End of sequence"): - sess.run(value) - - def testEnqueueWithPrefetch(self): - dataset = dataset_ops.Dataset.from_tensor_slices([0]) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) - # Prefetching will request additional values before they are - # available to the queue. 
- dataset = dataset.prefetch(buffer_size=3) - iterator = dataset.make_one_shot_iterator() - queue_handle, value = iterator.get_next() - enqueue = tqd.enqueue_in_queue_dataset(queue_handle, value + 1) - with self.cached_session() as sess: - i = 0 - while i < 4: - received, _ = sess.run((value, enqueue)) - if received.size > 0: - self.assertAllEqual([i], received) - i += 1 - received_last = False - while True: - try: - received = sess.run(value) - if received.size > 0: - self.assertAllEqual([4], received) - received_last = True - except errors.OutOfRangeError: - break - self.assertTrue(received_last) - - def testDatasetWithPaddedShapeSmallerThanInputFails(self): - dataset = dataset_ops.Dataset.from_tensor_slices([[0, 0, 0]]).repeat(None) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset( - batch_size=1, padded_shapes=[2])) - iterator = dataset.make_one_shot_iterator() - _, value = iterator.get_next() - with self.cached_session() as sess: - with self.assertRaisesOpError( - r"Incompatible input shapes at component 0 between " - r"input dataset this dataset: \[3\] vs. \[2\]"): - sess.run(value) - - def testEnqueueWithIncompatibleInputsFailsWithInformativeError(self): - dataset = dataset_ops.Dataset.from_tensor_slices([0]).repeat(None) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) - iterator = dataset.make_one_shot_iterator() - queue_handle, value = iterator.get_next() - - enqueue_bad_structure = tqd.enqueue_in_queue_dataset( - queue_handle, (value, value)) - enqueue_bad_dtype = tqd.enqueue_in_queue_dataset(queue_handle, - np.array( - [1.0], - dtype=np.float32)) - enqueue_bad_shape_no_batch_dim = tqd.enqueue_in_queue_dataset( - queue_handle, ([1],)) - enqueue_bad_shape = tqd.enqueue_in_queue_dataset(queue_handle, - np.array( - [[1]], dtype=np.int32)) - - with self.cached_session() as sess: - with self.assertRaisesOpError( - "mismatched number of tensors. Queue expects 1 tensors but " - "tried to insert 2"): - sess.run(enqueue_bad_structure) - with self.assertRaisesOpError(r"Expected component 0 to have batched " - r"shape \[1,...\], but saw shape: \[\]"): - sess.run(enqueue_bad_shape_no_batch_dim) - with self.assertRaisesOpError( - r"mismatched shapes at component 0. Attempted to insert tensor " - r"with shape \[1\] but queue expected shape: \[\]"): - sess.run(enqueue_bad_shape) - with self.assertRaisesOpError( - r"mismatched dtypes at component 0. Attempted to insert tensor " - r"of type float but queue expected type: int32"): - sess.run(enqueue_bad_dtype) - - def testEnqueueWithPaddedBatchFailsWithInformativeError(self): - dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) - with self.assertRaisesRegexp( - TypeError, r"Unable to create padding for field of type 'variant'"): - dataset.padded_batch(batch_size=10, padded_shapes=[1]) - - def testOneEnqueueWithPadding(self): - dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6]) - # Make a dataset of variable-length vectors and their lengths. - dataset = dataset.map( - lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype))) - # Emit a queue we can prepend to, and counts/values as padded - # batch. 
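(The padding-focused test continues just below.) The padded-batch half of the API can also be looked at in isolation: `padded_shapes` fixes the width each component is padded to, and a batch is emitted once `batch_size` elements are collected or the input ends. A small sketch under the same assumptions as the previous snippet:

```python
# Sketch only -- mirrors testBatchedWithBiggerPaddingNoEnqueue above;
# same TF 1.x / contrib assumptions as the previous snippet.
import tensorflow as tf
from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd

dataset = tf.data.Dataset.from_tensor_slices([[0], [1], [2]])
dataset = dataset.apply(
    tqd.prepend_from_queue_and_padded_batch_dataset(
        batch_size=2, padded_shapes=[3]))   # pad each row out to width 3
_, value = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
  print(sess.run(value))   # [[0, 0, 0], [1, 0, 0]]
  print(sess.run(value))   # [[2, 0, 0]] -- the short final batch
```

As `testNonstandardPadding` further down shows, `padding_values` can replace the default zero fill (for example, padding with -1).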
- dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=3)) - - iterator = dataset.make_one_shot_iterator() - queue, (count, padded_value) = iterator.get_next() - - # Split the padded_value into two pieces: head and rest - rest_indices = array_ops.squeeze(array_ops.where(count > 2), axis=1) - bound = math_ops.minimum(2, math_ops.reduce_max(count)) - value_head = padded_value[:, :bound] - count_rest = array_ops.gather(count - 2, rest_indices) - value_rest = array_ops.gather(padded_value, rest_indices)[:, bound:] - queue_rest = array_ops.gather(queue, rest_indices) - enqueue_rest_op = tqd.enqueue_in_queue_dataset(queue_rest, - (count_rest, value_rest)) - with ops.control_dependencies([enqueue_rest_op]): - calc = array_ops.identity(value_head) - - with self.cached_session() as sess: - self.assertAllEqual([[0, 0], [2, 2], [4, 4]], sess.run(calc)) - self.assertAllEqual([[4, 4], [6, 6]], sess.run(calc)) - self.assertAllEqual([[6, 6]], sess.run(calc)) - self.assertAllEqual([[6, 6]], sess.run(calc)) - # Get some final batches due to prefetching. - for _ in range(3): - try: - self.assertAllEqual( - np.empty(shape=(0, 0), dtype=np.int32), sess.run(calc)) - except errors.OutOfRangeError as e: - self.assertTrue(str(e).startswith("End of sequence")) - - def testNonstandardPadding(self): - dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6]) - # Make a dataset of variable-length vectors and their lengths. - dataset = dataset.map( - lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype))) - # Emit a queue we can prepend to, and counts/values as padded - # batch. - dataset = dataset.apply( - tqd.prepend_from_queue_and_padded_batch_dataset( - batch_size=3, padding_values=( - 0, - -1, - ))) - - iterator = dataset.make_one_shot_iterator() - _, (unused_count, padded_value) = iterator.get_next() - - with self.cached_session() as sess: - self.assertAllEqual([[-1, -1, -1, -1], [2, 2, -1, -1], [4, 4, 4, 4]], - sess.run(padded_value)) - self.assertAllEqual([[6] * 6], sess.run(padded_value)) - with self.assertRaisesOpError("End of sequence"): - sess.run(padded_value) - - -# TODO(ebrevdo): Figure out how to use run_core_tests to test state -# saving of an iterator that's had some tensors enqueued into its queue. 
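Before the serialization tests that follow, it is worth spelling out what `testOneEnqueueWithPadding` above exercises: consume a fixed number of timesteps from every row of the padded batch, and push the unfinished remainders back through the queue so they are revisited before fresh input. A condensed standalone restatement, under the same TF 1.x / contrib assumptions as the sketches above:

```python
# Sketch only -- condensed from testOneEnqueueWithPadding above; same
# TF 1.x / contrib assumptions. Each element is (length, values); we
# consume at most two timesteps per step and re-enqueue the leftovers.
import tensorflow as tf
from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd

dataset = tf.data.Dataset.from_tensor_slices([0, 2, 4, 6])
dataset = dataset.map(lambda c: (c, c * tf.ones((c,), dtype=c.dtype)))
dataset = dataset.apply(
    tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=3))
queue, (count, padded_value) = dataset.make_one_shot_iterator().get_next()

# Head: the first (up to) two timesteps of every row in the batch.
bound = tf.minimum(2, tf.reduce_max(count))
value_head = padded_value[:, :bound]
# Rest: rows with more than two timesteps left, pushed back for later.
rest_indices = tf.squeeze(tf.where(count > 2), axis=1)
count_rest = tf.gather(count - 2, rest_indices)
value_rest = tf.gather(padded_value, rest_indices)[:, bound:]
queue_rest = tf.gather(queue, rest_indices)
enqueue_rest = tqd.enqueue_in_queue_dataset(queue_rest, (count_rest, value_rest))
with tf.control_dependencies([enqueue_rest]):
  step = tf.identity(value_head)

with tf.Session() as sess:
  print(sess.run(step))  # [[0, 0], [2, 2], [4, 4]]
  print(sess.run(step))  # [[4, 4], [6, 6]] -- leftovers come back first
  # Further runs yield [[6, 6]] twice as the length-6 row is worked off.
```

In effect this is a chunked-consumption pattern: long sequences keep flowing back through the queue until they are exhausted.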
-class PrependFromQueueAndPaddedBatchDatasetSerializationTest( - dataset_serialization_test_base.DatasetSerializationTestBase): - - def testPrependFromQueueAndPaddedBatch(self): - - def build_dataset(seq_lens): - return dataset_ops.Dataset.from_tensor_slices(seq_lens).map( - lambda x: array_ops.fill([x], x)).apply( - tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=4)) - - seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32) - seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32) - self.run_core_tests(lambda: build_dataset(seq_lens1), - lambda: build_dataset(seq_lens2), 8) - - def testPrependFromQueueAndPaddedBatchNonDefaultPadding(self): - - def build_dataset(seq_lens): - - def fill_tuple(x): - filled = array_ops.fill([x], x) - return (filled, string_ops.as_string(filled)) - - padded_shape = [-1] - return dataset_ops.Dataset.from_tensor_slices(seq_lens).map( - fill_tuple).apply( - tqd.prepend_from_queue_and_padded_batch_dataset( - batch_size=4, - padded_shapes=(padded_shape, padded_shape), - padding_values=(-1, ""))) - - seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32) - seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32) - self.run_core_tests(lambda: build_dataset(seq_lens1), - lambda: build_dataset(seq_lens2), 8) - - -if __name__ == "__main__": - test.main() diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc index f7c979e8632..9db80f6b573 100644 --- a/tensorflow/contrib/verbs/rdma.cc +++ b/tensorflow/contrib/verbs/rdma.cc @@ -30,7 +30,6 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" #include "tensorflow/core/distributed_runtime/session_mgr.h" -#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/status.h" @@ -1028,7 +1027,10 @@ Status RdmaTensorResponse::PrepareRecvTensor( return errors::Aborted( "RecvTensor expects a different device incarnation: ", parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(), - ". Your worker job was probably restarted. Check your " + ". Your worker job (\"", + channel_->adapter_->worker_env_->session_mgr->LegacySession() + ->worker_name, + "\") was probably restarted. 
Check your " "worker job for the reason why it was restarted."); } diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index a701b38d4b3..575edfe7a93 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -95,7 +95,8 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu") load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") load("//tensorflow:tensorflow.bzl", "tf_version_info_genrule") -load("//tensorflow:tensorflow.bzl", "if_not_tx2_llvm_or_windows_cuda") +load("//tensorflow:tensorflow.bzl", "if_nccl") +load("//tensorflow:tensorflow.bzl", "tensorflow_opensource_extra_deps") load("//tensorflow:tensorflow.bzl", "tf_cuda_only_cc_test") # For platform specific build config @@ -112,6 +113,7 @@ load( "tf_additional_device_tracer_test_flags", "tf_additional_gdr_lib_defines", "tf_additional_human_readable_json_deps", + "tf_additional_logger_deps", "tf_additional_lib_defines", "tf_additional_lib_deps", "tf_additional_lib_hdrs", @@ -300,6 +302,7 @@ filegroup( "platform/env_time.h", "platform/logging.h", "platform/macros.h", + "platform/platform_strings.h", "platform/types.h", ], visibility = ["//visibility:private"], @@ -442,6 +445,18 @@ cc_library( ] + tf_additional_human_readable_json_deps(), ) +cc_library( + name = "logger", + srcs = tf_platform_srcs(["logger.cc"]), + hdrs = ["platform/logger.h"] + tf_platform_hdrs(["logger.h"]), + copts = tf_copts(), + visibility = ["//visibility:public"], + deps = [ + ":lib", + ":lib_internal", + ] + tf_additional_logger_deps(), +) + filegroup( name = "platform_env_hdrs", srcs = [ @@ -519,6 +534,19 @@ cc_library( ], ) +cc_library( + name = "platform_strings", + srcs = tf_platform_srcs([ + "platform/platform_strings.cc", + "platform/platform_strings_computed.h", + ]), + hdrs = [ + "platform/platform_strings.h", + ], + visibility = ["//tensorflow/core:__subpackages__"], + deps = [":lib"], +) + filegroup( name = "platform_other_hdrs", srcs = [ @@ -841,6 +869,7 @@ tf_cuda_library( "framework/dataset_stateful_op_whitelist.h", "framework/device_base.h", "framework/function.h", + "framework/function_handle_cache.h", "framework/graph_def_util.h", "framework/graph_to_functiondef.h", "framework/kernel_def_builder.h", @@ -884,6 +913,7 @@ tf_cuda_library( "util/bcast.h", "util/cuda_kernel_helper.h", "util/device_name_utils.h", + "util/dump_graph.h", "util/events_writer.h", "util/example_proto_fast_parsing.h", "util/example_proto_helper.h", @@ -901,6 +931,7 @@ tf_cuda_library( "util/stream_executor_util.h", "util/strided_slice_op.h", "util/tensor_format.h", + "util/tensor_ops_util.h", "util/tensor_slice_reader.h", "util/tensor_slice_reader_cache.h", "util/tensor_slice_writer.h", @@ -1038,6 +1069,7 @@ tf_gen_op_libs( "batch_ops", "bitwise_ops", "boosted_trees_ops", + "tensor_forest_ops", "candidate_sampling_ops", "checkpoint_ops", "collective_ops", @@ -1085,7 +1117,11 @@ tf_gen_op_libs( op_lib_names = [ "string_ops", ], - deps = ["@com_google_absl//absl/strings"], + deps = [ + ":lib_internal", + ":lib_proto_parsing", + "@com_google_absl//absl/strings", + ], ) tf_gen_op_libs( @@ -1187,6 +1223,7 @@ cc_library( ":batch_ops_op_lib", ":bitwise_ops_op_lib", ":boosted_trees_ops_op_lib", + ":tensor_forest_ops_op_lib", ":candidate_sampling_ops_op_lib", ":checkpoint_ops_op_lib", ":collective_ops_op_lib", @@ -1340,6 +1377,7 @@ cc_library( "//tensorflow/core/kernels:batch_kernels", "//tensorflow/core/kernels:bincount_op", "//tensorflow/core/kernels:boosted_trees_ops", + 
"//tensorflow/core/kernels:tensor_forest_ops", "//tensorflow/core/kernels:candidate_sampler_ops", "//tensorflow/core/kernels:checkpoint_ops", "//tensorflow/core/kernels:collective_ops", @@ -1386,9 +1424,7 @@ cc_library( "//tensorflow/core/kernels:summary_kernels", "//tensorflow/core/kernels:training_ops", "//tensorflow/core/kernels:word2vec_kernels", - ] + tf_additional_cloud_kernel_deps() + if_not_tx2_llvm_or_windows_cuda([ - "//tensorflow/core/kernels:nccl_kernels", - ]) + if_not_windows([ + ] + tf_additional_cloud_kernel_deps() + if_not_windows([ "//tensorflow/core/kernels:fact_op", "//tensorflow/core/kernels:array_not_windows", "//tensorflow/core/kernels:math_not_windows", @@ -1413,6 +1449,8 @@ cc_library( ]) + if_cuda([ "//tensorflow/core/grappler/optimizers:gpu_swapping_kernels", "//tensorflow/core/grappler/optimizers:gpu_swapping_ops", + ]) + if_nccl([ + "//tensorflow/core/kernels:nccl_kernels", ]), ) @@ -1437,7 +1475,7 @@ tf_cuda_library( ":gpu_runtime", ":lib", ":ops", - ], + ] + tensorflow_opensource_extra_deps(), ) cc_library( @@ -1577,6 +1615,8 @@ filegroup( "util/stats_calculator.*", "util/reporter.*", "platform/**/cuda_libdevice_path.*", + "platform/**/logger.cc", + "platform/**/logger.h", "platform/default/test_benchmark.*", "platform/cuda.h", "platform/google/**/*", @@ -1671,8 +1711,8 @@ cc_library( cc_library( name = "mobile_additional_lib_deps", deps = tf_additional_lib_deps() + [ + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", - "@com_google_absl//absl/time", ], ) @@ -1763,7 +1803,7 @@ cc_library( # registration of ops to prune code size. cc_library( name = "android_tensorflow_lib_selective_registration", - srcs = if_android(["//tensorflow/core:android_srcs"]), + srcs = if_android(["//tensorflow/core:android_srcs_only_runtime"]), copts = tf_copts(android_optimization_level_override = None) + [ "-DSUPPORT_SELECTIVE_REGISTRATION", ], @@ -1775,9 +1815,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":protos_all_cc_impl", - "//third_party/eigen3", - "@double_conversion//:double-conversion", - "@nsync//:nsync_cpp", + "@com_google_absl//absl/container:flat_hash_set", "@protobuf_archive//:protobuf", ], alwayslink = 1, @@ -1787,7 +1825,7 @@ cc_library( # no proto_rtti. 
cc_library( name = "android_tensorflow_lib_selective_registration_nortti", - srcs = if_android(["//tensorflow/core:android_srcs"]), + srcs = if_android(["//tensorflow/core:android_srcs_only_runtime"]), copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [ "-DSUPPORT_SELECTIVE_REGISTRATION", ], @@ -1799,9 +1837,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":protos_all_cc_impl", - "//third_party/eigen3", - "@double_conversion//:double-conversion", - "@nsync//:nsync_cpp", + "@com_google_absl//absl/container:flat_hash_set", "@protobuf_archive//:protobuf", ], alwayslink = 1, @@ -2045,9 +2081,7 @@ tf_proto_library_cc( srcs = ["protobuf/master.proto"], cc_api_version = 2, protodeps = tf_additional_all_protos(), - visibility = [ - "//tensorflow:internal", - ], + visibility = ["//tensorflow:internal"], ) tf_proto_library_cc( @@ -2187,6 +2221,7 @@ cc_library( "platform/**/env_time.cc", "platform/**/cuda_libdevice_path.cc", "platform/**/device_tracer.cc", + "platform/**/logger.cc", "platform/**/logging.cc", "platform/**/human_readable_json.cc", "platform/abi.cc", @@ -2199,6 +2234,7 @@ cc_library( "platform/**/stream_executor.h", "platform/**/env_time.cc", "platform/**/device_tracer.cc", + "platform/**/logger.cc", "platform/**/logging.cc", "platform/**/human_readable_json.cc", "platform/abi.cc", @@ -2641,6 +2677,8 @@ tf_cuda_library( ":stats_calculator_portable", ":version_lib", "@com_google_absl//absl/base", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", "//tensorflow/core/platform/default/build_config:platformlib", "//tensorflow/core/kernels:bounds_check", "//third_party/eigen3", @@ -2943,6 +2981,7 @@ tf_cuda_library( ":lib_internal", ":proto_text", ":protos_all_cc", + "@com_google_absl//absl/memory", "//third_party/eigen3", "//tensorflow/core/grappler:grappler_item", ] + mkl_deps(), @@ -3008,7 +3047,6 @@ tf_cuda_library( hdrs = ["common_runtime/metrics.h"], deps = [ ":lib", - "@com_google_absl//absl/time", ], ) @@ -3033,7 +3071,6 @@ tf_cuda_library( ":protos_all_cc", "//tensorflow/core/debug:debug_graph_utils", "//tensorflow/core/kernels:function_ops", - "@com_google_absl//absl/time", ], alwayslink = 1, ) @@ -3393,6 +3430,7 @@ tf_cc_tests( "platform/profile_utils/cpu_utils_test.cc", "platform/stacktrace_handler_test.cc", "platform/subprocess_test.cc", + "platform/vmodule_benchmark_test.cc", ], deps = [ ":lib", @@ -3406,6 +3444,20 @@ tf_cc_tests( ], ) +tf_cc_test( + name = "vmodule_test", + srcs = ["platform/vmodule_test.cc"], + tags = ["optonly"], + deps = [ + ":lib", + ":lib_internal", + ":lib_test_internal", + ":protos_all_cc", + ":test", + "//third_party/eigen3", + ], +) + tf_cc_test( name = "lib_random_random_distributions_test", srcs = ["lib/random/random_distributions_test.cc"], @@ -3421,6 +3473,16 @@ tf_cc_test( ], ) +tf_cc_test( + name = "platform_strings_test", + size = "small", + srcs = ["platform/platform_strings_test.cc"], + deps = [ + ":lib", + ":platform_strings", + ], +) + tf_cc_test( name = "platform_env_test", size = "small", @@ -3668,6 +3730,7 @@ tf_cc_tests( "util/bcast_test.cc", "util/command_line_flags_test.cc", "util/device_name_utils_test.cc", + "util/dump_graph_test.cc", "util/equal_graph_def_test.cc", "util/events_writer_test.cc", "util/example_proto_fast_parsing_test.cc", @@ -3798,6 +3861,7 @@ tf_cc_tests_gpu( ":test", ":test_main", ":testlib", + "@com_google_absl//absl/memory", ], ) @@ -3826,6 +3890,7 @@ tf_cc_tests_gpu( ":test", ":test_main", ":testlib", + 
"@com_google_absl//absl/memory", ], ) @@ -4099,6 +4164,7 @@ tf_cc_test( "//tensorflow/core/kernels:identity_op", "//tensorflow/core/kernels:immutable_constant_op", "//tensorflow/core/kernels:matmul_op", + "//tensorflow/core/kernels:topk_op", "//third_party/eigen3", ], ) @@ -4392,6 +4458,7 @@ tf_cc_test( "//tensorflow/core/kernels:random_ops", "//tensorflow/core/kernels:shape_ops", "//third_party/eigen3", + "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", ], ) @@ -4871,6 +4938,7 @@ transitive_hdrs( "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:platform_strings", "//tensorflow/core:protos_all_cc", "//tensorflow/core:stream_executor", ], diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc index 6f988569159..d38a8424eb1 100644 --- a/tensorflow/core/api_def/api_test.cc +++ b/tensorflow/core/api_def/api_test.cc @@ -182,11 +182,14 @@ void TestDeprecationVersionSetCorrectly( for (const auto& name_and_api_def : api_defs_map) { const auto& name = name_and_api_def.first; const auto& api_def = name_and_api_def.second; - ASSERT_TRUE(api_def.deprecation_version() == 0 || - api_def.deprecation_message().empty()) - << "ApiDef that includes deprecation_version > 0 must also specify " - << "a deprecation_message. Op " << name - << " has deprecation_version > 0 but deprecation_message is not set."; + if (api_def.deprecation_version() != 0) { + ASSERT_TRUE(api_def.deprecation_version() > 0) + << "Found ApiDef with negative deprecation_version"; + ASSERT_FALSE(api_def.deprecation_message().empty()) + << "ApiDef that includes deprecation_version > 0 must also specify " + << "a deprecation_message. Op " << name + << " has deprecation_version > 0 but deprecation_message is not set."; + } } } } // namespace diff --git a/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt index 639d962874d..32def912f83 100644 --- a/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt @@ -1,5 +1,6 @@ op { graph_op_name: "BatchDataset" + visibility: HIDDEN in_arg { name: "batch_size" description: <